
"""PyTorch UMT5 model."""

import copy
import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from .configuration_umt5 import UMT5Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # UMT5 uses a layer norm that only rescales (RMSNorm): the variance is computed without
        # subtracting the mean and there is no bias. The accumulation is done in float32 so that
        # half-precision inputs stay numerically stable.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert back into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


class UMT5DenseActDense(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # `wo` may be kept in a different dtype (e.g. float32 for quantized checkpoints); cast the
        # activations to match it unless it is an int8 quantized weight.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class UMT5DenseGatedActDense(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # Cast to the dtype of `wo` unless it is an int8 quantized weight.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class UMT5LayerFF(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = UMT5DenseGatedActDense(config)
        else:
            self.DenseReluDense = UMT5DenseActDense(config)

        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class UMT5Attention(nn.Module):
    r"""
    T5's attention using relative_attention_bias.
    """

    def __init__(self, config: UMT5Config, has_relative_attention_bias: bool = False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        new_projection_shape = projection.size()[:-1] + (self.n_heads, self.key_value_proj_dim)
        # move the head dimension in front of the sequence dimension: (batch, heads, seq_len, head_dim)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def _relative_position_bucket(self, relative_position):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on.

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        num_buckets = self.relative_attention_num_buckets
        max_distance = self.relative_attention_max_distance
        if not self.is_decoder:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        log_ratio = torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact)
        log_ratio = log_ratio * (num_buckets - max_exact)
        relative_position_if_large = max_exact + log_ratio.to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(relative_position)
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        # if encoder_hidden_states are provided, this layer is used as a cross-attention layer
        is_cross_attention = encoder_hidden_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None and isinstance(past_key_value, EncoderDecoderCache):
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache
        else:
            is_updated = False
            curr_past_key_value = past_key_value

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse key/value states of the cross-attention
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that this layer's cross-attention is already cached so it can be re-used
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        # compute positional bias
        real_seq_length = seq_length + past_key_value.get_seq_length() if past_key_value is not None else seq_length
        key_length = key_states.shape[-2]
        if not self.has_relative_attention_bias:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
            )
        else:
            position_bias = self.compute_bias(
                real_seq_length, key_length, device=scores.device, cache_position=cache_position
            )
            position_bias = position_bias[:, :, -seq_length:, :]

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # mask heads if requested
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_length, -1)
        attn_output = self.o(attn_output)
        return attn_output, attn_weights


class UMT5LayerSelfAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True, layer_idx=layer_idx)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]
        return outputs


class UMT5LayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]
        return outputs


class UMT5Block(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(UMT5LayerSelfAttention(config, layer_idx=layer_idx))
        if self.is_decoder:
            self.layer.append(UMT5LayerCrossAttention(config, layer_idx=layer_idx))

        self.layer.append(UMT5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        # Self Attention
        hidden_states, self_attn_weights = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Cross-Attention Block
        cross_attn_weights = None
        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            hidden_states, cross_attn_weights = self.layer[1](
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
            )

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                max_dtype = torch.finfo(hidden_states.dtype).max
                clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class UMT5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: UMT5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class UMT5PreTrainedModel(PreTrainedModel):
    config: UMT5Config
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _can_compile_fullgraph = False
    _no_split_modules = ["UMT5Block"]
    _keep_in_fp32_modules = ["wo"]

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, UMT5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, (UMT5Model, UMT5ForConditionalGeneration, UMT5EncoderModel, UMT5ForQuestionAnswering)):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "qa_outputs"):
                module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
                module.qa_outputs.bias.data.zero_()
        elif isinstance(module, UMT5ForTokenClassification):
            if hasattr(module, "classifier"):
                module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0)
                module.classifier.bias.data.zero_()
        elif isinstance(module, UMT5ClassificationHead):
            module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if module.dense.bias is not None:
                module.dense.bias.data.zero_()
            module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
        elif isinstance(module, UMT5DenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the "
                "pad_token_id. See UMT5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class UMT5Stack(UMT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.block = nn.ModuleList([UMT5Block(config, layer_idx=i) for i in range(config.num_layers)])
        self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache is True and not self.is_decoder:
            raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        # initialize the cache lazily; the encoder never keeps a cache
        if self.is_decoder:
            if use_cache and past_key_values is None:
                if self.config.is_encoder_decoder:
                    past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
                else:
                    past_key_values = DynamicCache()
        else:
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
                output_attentions,
            )
        elif attention_mask is not None:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
        else:
            causal_mask = None

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head masks if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.is_decoder else None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=causal_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)
                if self.is_decoder:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When possible, rely on SDPA's `is_causal` argument instead of materializing a mask.
        if (
            self.config._attn_implementation == "sdpa"
            and not using_compilable_cache
            and not output_attentions
            and AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            )
        ):
            return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, build a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using left padding.
            # This is required by the memory-efficient attention path of F.scaled_dot_product_attention.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class UMT5Model(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(noisy_text, return_tensors="pt")
    >>> labels = tokenizer(text_target=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
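    >>> # Illustrative check (not part of the model API): the decoder output lives in the model
    >>> # dimension configured by `config.d_model`
    >>> hidden_states.shape[-1] == model.config.d_model
    True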
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
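        >>> # Illustrative check: the encoder output is returned alongside the decoder output
        >>> outputs.encoder_last_hidden_state.shape[-1] == model.config.d_model
        True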
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    UMT5 Model with a `language modeling` head on top.
    """
)
class UMT5ForConditionalGeneration(UMT5PreTrainedModel, GenerationMixin):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
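    >>> # Illustrative check: the language modeling head projects back onto the vocabulary
    >>> outputs.logits.shape[-1] == model.config.vocab_size
    True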
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
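
        >>> # Illustrative addition (not part of the original example): labels are shifted right
        >>> # internally, so the decoder is fed `config.decoder_start_token_id` (the pad token for
        >>> # UMT5) followed by the label tokens. The same shift can be inspected directly:
        >>> decoder_input_ids = model.prepare_decoder_input_ids_from_labels(labels)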

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to the logits device to enable pipeline parallelism
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
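
    >>> # Illustrative addition (assumes the checkpoint above): the encoder returns one vector per
    >>> # input token, so `hidden_state` has shape (batch_size, sequence_length, config.d_model).
    >>> batch_size, sequence_length, d_model = hidden_state.shape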
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class `PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            To know more on how to prepare `input_ids` for pretraining take a look at [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
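
        >>> # Illustrative addition: per-layer states can also be requested; the returned tuple is
        >>> # assumed to follow the usual BaseModelOutput layout (embedding output first, then one
        >>> # entry per encoder block).
        >>> outputs = model(input_ids=input_ids, output_hidden_states=True)
        >>> all_hidden_states = outputs.hidden_states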
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@auto_docstring(
    custom_intro="""
    UMT5 model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
    tasks.
    """
)
class UMT5ForSequenceClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.transformer = UMT5Model(config)
        self.classification_head = UMT5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

        self.model_parallel = False

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
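
        Example (an illustrative sketch only; `google/umt5-small` ships without a fine-tuned
        classification head, so the two-label head below is freshly initialized):

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForSequenceClassification.from_pretrained("google/umt5-small", num_labels=2)
        >>> inputs = tokenizer("UMT5 covers over 100 languages.", return_tensors="pt")
        >>> logits = model(**inputs).logits  # shape: (batch_size, num_labels)
        ```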
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # Unlike most models, UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class UMT5ForTokenClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.*"]
    _tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = UMT5EncoderModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
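
        Example (an illustrative sketch only; the label count and checkpoint below are assumptions,
        and the classification layer starts untrained):

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForTokenClassification.from_pretrained("google/umt5-small", num_labels=5)
        >>> inputs = tokenizer("Jason lives in Berlin.", return_tensors="pt")
        >>> logits = model(**inputs).logits  # shape: (batch_size, sequence_length, num_labels)
        ```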
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class UMT5ForQuestionAnswering(UMT5PreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
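
        Example (an illustrative sketch only; `google/umt5-small` has no trained span head, so the
        predicted indices are meaningful only after fine-tuning):

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForQuestionAnswering.from_pretrained("google/umt5-small")
        >>> question, context = "Where does Jason live?", "Jason lives in Berlin."
        >>> inputs = tokenizer(question, context, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> start_index = int(outputs.start_logits.argmax(-1))
        >>> end_index = int(outputs.end_logits.argmax(-1))
        ```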
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        # Unlike most models, UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   configuration_umt5r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrC   r   Moduler%   rI   rb   rj   rt   r   r   r   r  r  rK  r)  r*  r+  r  r2  r,  __all__rI  r3   r2   <module>r6     s      "   A A ! C C ) > 9   .   +  !;J			H	%+BII +4		 .RYY <")) $B)BII B)JRYY 8bii <F* FTRYY $ o!/ o! o!dt# tn	 O
# O
 O
d 
g)#6 g)
g)T i* i iX `
$7 `
`
F I
!4 I
 I
X N
2 N
 N
br3   