"""PyTorch MVP model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mvp import MvpConfig


logger = logging.get_logger(__name__)


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
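

# A minimal illustration of the shift (not executed on import): with
# `pad_token_id=1` and `decoder_start_token_id=2`,
#
#     shift_tokens_right(torch.tensor([[5, 6, 1]]), 1, 2)
#
# returns `tensor([[2, 5, 6]])` -- the decoder start token is prepended, every
# token moves one position to the right (dropping the last one), and any `-100`
# label placeholders would be replaced by the pad token id.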


class MvpLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Mvp is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self, input_ids: torch.Tensor, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None
    ):
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        return super().forward(position_ids + self.offset)
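

# Illustrative note on the offset above: with `past_key_values_length=0`, a
# 3-token input produces position ids `[0, 1, 2]`, which are looked up at
# embedding rows `[2, 3, 4]`; this is why `__init__` allocates
# `num_embeddings + 2` rows.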
ee	j                     dee   dee	j                     dee	j                     dee	j                     dedee	j                     dee	j                  ee	j                     eee	j                        f   fdZ xZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rS   )r3   r4   rO   rP   rQ   head_dimr%   scalingrR   rT   r   Lineark_projv_projq_projout_proj)r5   rO   rP   rQ   rR   rS   rT   r6   s          r(   r4   zMvpAttention.__init__`   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr*   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                 .   |du}	|j                         \  }
}}| j                  |      | j                  z  }|St        |t              rA|j
                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j                  |
d| j                   | j"                        j%                  dd      }|j                  |
d| j                   | j"                        j%                  dd      }|D|	s|nd}j'                  ||| j                  d|i      \  }}|	rd|j
                  | j                  <   |t)        j*                  |d   j-                  |
ddd      |gd      }t)        j*                  |d   j-                  |
ddd      |gd      }|\t)        j.                  |
d||d   j                  d            j1                  |j2                        }t)        j*                  ||gd      }|
| j                   z  d| j"                  f}|j                  |
|| j                   | j"                        j%                  dd      } |j4                  | } |j4                  | } |j4                  | }|j                  d      }t)        j6                  ||j%                  dd            }|j                         |
| j                   z  ||fk7  r/t9        d	|
| j                   z  ||f d
|j                                |{|j                         |
d||fk7  r#t9        d|
d||f d
|j                                |j                  |
| j                   ||      |z   }|j                  |
| j                   z  ||      }t:        j<                  j?                  |d      }||j                         | j                   fk7  r*t9        d| j                   f d
|j                                |j                  dddd      |j                  |
| j                   ||      z  }|j                  |
| j                   z  ||      }|r?|j                  |
| j                   ||      }|j                  |
| j                   z  ||      }nd}t:        j<                  jA                  || j@                  | jB                        }t)        j6                  ||      }|j                         |
| j                   z  || j"                  fk7  r7t9        d|
| j                   || j"                  f d
|j                                |j                  |
| j                   || j"                        }|j%                  dd      }|j5                  |
|| jD                        }| jG                  |      }||fS )z#Input shape: Batch x Time x ChannelNr!   r   r1   re   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizer\   rX   
isinstancer   
is_updatedgetrT   cross_attention_cacheself_attention_cachelayerskeysvaluesrZ   r[   viewrP   rW   	transposeupdater<   catr@   zerostor;   reshapebmmr%   r   
functionalsoftmaxrQ   rl   rO   r]   )r5   r^   r_   r`   ra   rb   rc   rd   re   is_cross_attentionrC   tgt_len_query_statesro   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                             r(   rB   zMvpAttention.forward}   s    .T9',,.Wa {{=1DLL@%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn="KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q';q>;N;Nq;QRUUVdVkVkl!&K+Hr!SDNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r*   )        FTN)NNNNNFN)rE   rF   rG   rH   rI   r   floatboolr4   r<   rJ   r   tuplerB   rK   rL   s   @r(   rN   rN   ]   sE   G $'%*#$(CC C %	C
 TNC tnC D>C@ 48*.1526.2"'15|2|||2 #5<<0|2 !	|2
 !.|2 "%,,/|2 ell+|2  |2 !.|2 
u||Xell3XeELL>Q5RR	S|2r*   rN   c                        e Zd Zdef fdZ	 d
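

# Shape sketch for the prompt path above (illustrative, following the layer
# docstrings below): `attn_prompt` is a pair of tensors, each of shape
# `(num_heads, pro_len, head_dim)`. Each is expanded to
# `(bsz, num_heads, pro_len, head_dim)` and concatenated in front of the
# projected key/value states, so `src_len` grows by `pro_len`, and the additive
# attention mask is left-padded with zeros so the prompt positions are always
# attendable.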
dej                  dej                  dej                  dej                  dee   de	ej                  eej                     f   fd	Z
 xZS )MvpEncoderLayerconfigc                 f   t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)rO   rP   rQ   )r3   r4   d_modelrO   rN   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrQ   r
   activation_functionactivation_fnactivation_dropoutrY   encoder_ffn_dimfc1fc2final_layer_normr5   r   r6   s     r(   r4   zMvpEncoderLayer.__init__   s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r*   r^   ra   rb   self_attn_promptrd   rf   c                    |}| j                  |||||      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rt        j                  |      j                         s#t        j                   |      j                         rEt        j"                  |j                        j$                  dz
  }t        j&                  || |      }||fS )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r^   ra   rb   rc   rd   rj   i  )minmax)r   r   r~   rQ   rl   r   r   r   r   r   r   r:   r<   float16isinfanyisnanfinfor   clamp)	r5   r^   ra   rb   r   rd   residualr   clamp_values	            r(   rB   zMvpEncoderLayer.forward  s   * !&*nn')+(/ '5 '
#| --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/KK&**,M0J0N0N0P++m&9&9:>>EK!KKK<[YMl**r*   )F)rE   rF   rG   r   r4   r<   FloatTensorr   r   r   rB   rK   rL   s   @r(   r   r      s    =y =, -2/+((/+ ))/+ **	/+
  ++/+ $D>/+ 
u  (5+<+<"==	>/+r*   r   c                        e Zd Zddef fdZ	 	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eej                     d
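

# Note on the fp16 clamp above (illustrative): `torch.finfo(torch.float16).max`
# is 65504, so activations that overflowed to `inf`/`nan` in half precision are
# clamped to roughly +/-64504 before leaving the layer.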
eej                     dee   dee	   dee	   deej                     de
ej                  ee
ej                  ej                  f      f   fdZ xZS )MvpDecoderLayerr   c                    t         |           |j                  | _        t	        | j                  |j
                  |j                  d|      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d|      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)rO   rP   rQ   rR   rT   )rQ   rR   rT   )r3   r4   r   rO   rN   decoder_attention_headsr   r   rQ   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normrY   decoder_ffn_dimr   r   r   )r5   r   rT   r6   s      r(   r4   zMvpDecoderLayer.__init__@  s   %nn44,,
 ~~#F$>$>?"(";";$&LL$@!(NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r*   r^   ra   encoder_hidden_statesencoder_attention_maskrb   cross_attn_layer_head_maskr   cross_attn_promptr`   rd   	use_cachere   rf   c           	      4   |}| j                  ||	||||
|      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }d}|i|}| j                  ||||||	|
      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|f}|
r|||fz  }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r^   r`   ra   rb   rc   rd   re   rj   N)r^   r_   ra   rb   rc   r`   rd   )r   r   r~   rQ   rl   r   r   r   r   r   r   r   r   )r5   r^   ra   r   r   rb   r   r   r   r`   rd   r   re   r   self_attn_weightscross_attn_weightsoutputss                    r(   rB   zMvpDecoderLayer.forward\  s   J ! ,0>>'))+(/) ,: ,
(( --mt||VZVcVc-d =011-@ " ,$H040A0A+!65 :--"3 1B 1-M- MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>Gr*   N)NNNNNNNNFTN)rE   rF   rG   r   r4   r<   rJ   r   r   r   r   r   rB   rK   rL   s   @r(   r   r   ?  sH   =y => 268<9=26=A3748*.,1$(15U||U !.U  (5	U
 !) 6U "%,,/U %-U\\$:U #5<<0U $ELL1U !U $D>U D>U !.U 
u  (51B1BEDUDU1U+V"WW	XUr*   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )Nrk   )r3   r4   r   rY   denseDropoutrQ   r]   )r5   r   r   r   r   r6   s        r(   r4   zMvpClassificationHead.__init__  sD     	YYy)4
zzN3		)[9r*   r^   rf   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r   )rQ   r   r<   tanhr]   )r5   r^   s     r(   rB   zMvpClassificationHead.forward  sN    ]3

=1

=1]3m4r*   )rE   rF   rG   rH   rI   r   r4   r<   rJ   rB   rK   rL   s   @r(   r   r     sL    7
:
: 
: 	
:
 
:U\\ ell r*   r   c                   `     e Zd ZdZ fdZdej                  deej                     fdZ xZ	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	      8   t         |           |j                  | _        || _        || _        |j
                  |z  | _        t        j                  |j                        | _	        t        j                  |j                  |j
                        | _        t        j                  t        j                  |j
                  |j                        t        j                         t        j                  |j                  |dz  |j
                  z              | _        y )Nr   r1   )r3   r4   prompt_length
num_layersrP   r   rW   r   r   rQ   	Embeddingprompt_embedding
SequentialrY   prompt_mid_dimGELUprompt_trans)r5   r   r   rP   r6   s       r(   r4   zMvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r*   
prompt_idsrf   c                 *   | j                  | j                  |            }|j                  | j                  | j                  dz  | j
                  | j                        }| j                  |      }|j                  g d      j                  d      }|S )Nr1   )r   r1   r   r	   )
r   r   rv   r   r   rP   rW   rQ   permutesplit)r5   r   prompts      r(   rB   zMvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r*   )
rE   rF   rG   rH   r4   r<   rJ   r   rB   rK   rL   s   @r(   r   r     s+    3
%,, 53F r*   r   c                   6    e Zd ZU eed<   dZdZd Zed        Z	y)MvpPreTrainedModelr   modelTc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y y )Nr   )meanstd)r   init_stdrn   r   rY   r?   datanormal_rS   zero_r   padding_idx)r5   moduler   s      r(   _init_weightsz MvpPreTrainedModel._init_weights  s    kk""fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r*   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      |d}|S )N)r      
      r1   r         r1   r;   )ra   r   )r   r   r<   tensorr;   ne)r5   	pad_tokenr   dummy_inputss       r(   r   zMvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r*   N)
rE   rF   rG   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr    r*   r(   r   r     s-    &*#	?  r*   r   c                       e Zd ZdZ	 ddedeej                     dee   f fdZ		 	 	 	 	 	 	 ddee
j                     dee
j                     dee
j                     d	ee
j                     d
ee   dee   dee   deeef   fdZ xZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    """

    def __init__(
        self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
    ):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        self.embed_positions = MvpLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([MvpEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.use_prompt = use_prompt
        if use_prompt:
            self.prompt_length = config.prompt_length
            self.self_attn_prompt = MvpPrompt(
                config,
                config.encoder_layers,
                config.encoder_attention_heads,
            )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # layer-wise prompt
        if self.use_prompt:
            prompt_ids = torch.arange(self.prompt_length).to(self.device)
            self_attn_prompt = self.self_attn_prompt(prompt_ids)

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != (len(self.layers)):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop: randomly skip whole layers during training
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    self_attn_prompt=(self_attn_prompt[idx] if self.use_prompt else None),
                    output_attentions=output_attentions,
                )
                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MvpDecoder(MvpPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    """

    def __init__(
        self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
    ):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        self.embed_positions = MvpLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([MvpDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.use_prompt = use_prompt
        if use_prompt:
            self.prompt_length = config.prompt_length
            self.self_attn_prompt = MvpPrompt(
                config,
                config.decoder_layers,
                config.decoder_attention_heads,
            )
            self.cross_attn_prompt = MvpPrompt(
                config,
                config.decoder_layers,
                config.decoder_attention_heads,
            )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        # embed positions
        positions = self.embed_positions(input, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # layer-wise prompt
        if self.use_prompt:
            prompt_ids = torch.arange(self.prompt_length).to(self.device)
            self_attn_prompt = self.self_attn_prompt(prompt_ids)
            cross_attn_prompt = self.cross_attn_prompt(prompt_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.layers)):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # add LayerDrop: randomly skip whole layers during training
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                ),
                self_attn_prompt=(self_attn_prompt[idx] if self.use_prompt else None),
                cross_attn_prompt=(cross_attn_prompt[idx] if self.use_prompt else None),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring
class MvpModel(MvpPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["final_logits_bias"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: MvpConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        self.use_prompt = config.use_prompt
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)

        self.encoder = MvpEncoder(config, self.shared, config.use_prompt)
        self.decoder = MvpDecoder(config, self.shared, config.use_prompt)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def set_lightweight_tuning(self):
        assert self.use_prompt, "If you want to use lightweight tuning, make sure that `use_prompt=True`."

        self.requires_grad_(False)
        self.encoder.self_attn_prompt.requires_grad_(True)
        self.decoder.self_attn_prompt.requires_grad_(True)
        self.decoder.cross_attn_prompt.requires_grad_(True)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        # different to other models, Mvp automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
                    "passed, `input_ids` cannot be `None`. Please pass either "
                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )

            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
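

# A minimal usage sketch (illustrative; the "RUCAIBox/mvp" checkpoint name is
# taken from the docstrings below):
#
#     from transformers import AutoTokenizer, MvpModel
#
#     tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
#     model = MvpModel.from_pretrained("RUCAIBox/mvp")
#     inputs = tokenizer("Hello, world!", return_tensors="pt")
#     outputs = model(**inputs)
#     last_hidden_state = outputs.last_hidden_state  # (batch, seq_len, d_model)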


@auto_docstring(
    custom_intro="""
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    """
)
class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: MvpConfig):
        super().__init__(config)
        self.model = MvpModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.lm_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
    Mvp model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
    tasks.
    """
)
class MvpForSequenceClassification(MvpPreTrainedModel):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

    def __init__(self, config: MvpConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = MvpModel(config)
        self.classification_head = MvpClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.classification_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after fine-tuning the model
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
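
        When `num_labels == 1` the head is instead treated as a regression model and trained with a mean-squared-error
        loss; a minimal sketch, reusing `inputs` from above (the float label value is purely illustrative):

        ```python
        >>> regression_model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=1)
        >>> loss = regression_model(**inputs, labels=torch.tensor([0.5])).loss
        ```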
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
            :, -1, :
        ]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class MvpForQuestionAnswering(MvpPreTrainedModel):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = MvpModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.qa_outputs.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extractive question answering; the model also supports generative question answering
        using [`MvpForConditionalGeneration`]
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after fine-tuning the model
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
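
        A common way to rank candidate answer spans is to add the start and end logits of each `(start, end)` pair;
        a minimal sketch scoring the span predicted above (batch index 0, purely illustrative):

        ```python
        >>> span_score = outputs.start_logits[0, answer_start_index] + outputs.end_logits[0, answer_end_index]
        ```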
        NFr  r   r   r!   rh   )ignore_indexr1   )
r  start_logits
end_logitsr9  rk  rl  r<  rm  r   rn  )r   r!  r   r  r   r  
contiguousr"  rm   r   r   r   r9  rk  rl  r<  rm  r   rn  )r5   r   ra   re  rf  r  rg  r8  rh  r  r  r  ri  r   rd   r  r  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r(   rB   zMvpForQuestionAnswering.forward  s   f &1%<k$++B]B]&=+DI**)/#9/!5+'"7/!5#  
" "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r*   rp  )rE   rF   rG   rr  r4   rd  r   r   r<   rJ   r0  rK  r   r   r   r   r   rB   rK   rL   s   @r(   r  r    s   79VW
.  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
ELL)Q
 !.Q
 $E$4$45	Q

 !))9)9 :Q
 ELL)Q
 $ELL1Q
 'u||4Q
 "$u'8'8"9:Q
 "%"2"23Q
   0 01Q
   1 12Q
  ((9(9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
r*   r  c                   (     e Zd ZdZ fdZd Z xZS )MvpDecoderWrapperz
    This wrapper is a helper class that correctly loads pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
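
    A rough sketch of the delegation (illustrative only, with `config` standing for any `MvpConfig`):

    ```python
    wrapper = MvpDecoderWrapper(config)
    outputs = wrapper(input_ids)  # forwarded verbatim to wrapper.decoder(input_ids)
    ```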
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = MvpDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = MvpDecoderWrapper(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.lm_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
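        >>> # MvpForCausalLM also inherits from GenerationMixin, so free-form generation works as
        >>> # well (a minimal sketch; `max_new_tokens=10` is an illustrative choice)
        >>> generated_ids = model.generate(**inputs, max_new_tokens=10)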
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
__all__ = [
    "MvpForCausalLM",
    "MvpForConditionalGeneration",
    "MvpForQuestionAnswering",
    "MvpForSequenceClassification",
    "MvpModel",
    "MvpPreTrainedModel",
]