
"""PyTorch PLBART model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available
from ..bart.modeling_bart import (
    BartClassificationHead,
    BartDecoder,
    BartEncoder,
    BartForCausalLM,
    BartScaledWordEmbedding,
)
from ..bigbird_pegasus.modeling_bigbird_pegasus import BigBirdPegasusForSequenceClassification
from ..mbart.modeling_mbart import shift_tokens_right
from .configuration_plbart import PLBartConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class PLBartScaledWordEmbedding(BartScaledWordEmbedding):
    pass


@auto_docstring
class PLBartPreTrainedModel(PreTrainedModel):
    config: PLBartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ) -> Union[torch.Tensor, None]:
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[Cache],
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones(
                        size=(input_tensor.shape[0], input_tensor.shape[1]),
                        device=input_tensor.device,
                    )
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        # For SDPA, when possible, rely on its `is_causal` argument instead of its `attn_mask` argument, in order
        # to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail to
        # infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention's memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ) -> Union[torch.Tensor, None]:
        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask,
                        query_length=input_shape[-1],
                        is_causal=False,
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask


class PLBartEncoder(BartEncoder):
    pass


class PLBartDecoder(BartDecoder):
    pass


@auto_docstring
class PLBartModel(PLBartPreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.shared = PLBartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)

        self.encoder = PLBartEncoder(config, self.shared)
        self.decoder = PLBartDecoder(config, self.shared)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Different from other models, PLBart automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    """
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)
        self.model = PLBartModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        self.init_weights()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Mask-filling:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id)


class PLBartClassificationHead(BartClassificationHead):
    pass


class PLBartForSequenceClassification(BigBirdPegasusForSequenceClassification):
    def forward(self, **super_kwargs):
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        super().forward(**super_kwargs)


class PLBartForCausalLM(BartForCausalLM):
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base", add_cross_attention=False)
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```
        """
        super().forward(**super_kwargs)


__all__ = [
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PLBartForSequenceClassification",
    "PLBartModel",
    "PLBartPreTrainedModel",
]