
    rh`                       d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ dd	lmZmZmZmZmZm Z  dd
l!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z> ddl?m@Z@mAZAmBZB ddlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL ddl!mMZMmNZNmOZOmPZPmQZQ ddlRmSZS ddlTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZn ddlompZpmqZqmrZrmsZsmtZtmuZumvZv erddlwmxZx ddlymzZz ddl{m|Z|  e;j                  e~      Z e7       rd dlmZmZ g dZe G d  d!e6             Ze G d" d#e6             Ze G d$ d%e6             Ze G d& d'e6             ZeZeZeZeZeZeZeZeZeZeZeeef   Zeeef   Zeeef   Zeeef   Zeeef   Zeeef   Zeeef   Zeeef   Z G d( d)eS      Zd* ZdDd+Zd,ej:                  d-ej:                  d.ej:                  d/ej<                  d0ed1ed2ej:                  fd3Zd4ee6   d5e"d2e6fd6Zd7 ed8       d9dfd:ej:                  d;ej:                  d<ed=ed>ed2ej:                  fd?Zd@ee   dAeeej:                  f   dBej:                  d2ej:                  fdCZy)E    N)	dataclass)TYPE_CHECKINGAnyCallableOptionalUnion)file_exists)version)nn)
functional   )CacheDynamicCacheEncoderDecoderCacheHybridChunkedCacheOffloadedCacheOffloadedHybridCache)PretrainedConfig)check_python_requirementsget_cached_module_fileget_class_in_moduleresolve_trust_remote_code)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_masks_for_generate)CausalLMOutputWithPastSeq2SeqLMOutput)isin_mps_friendly)ExtensionsTrie)ModelOutputis_accelerate_availableis_hqq_availableis_optimum_quanto_availableis_torchdynamo_exportinglogging   )DisjunctiveConstraintPhrasalConstraint)
BeamScorerBeamSearchScorerConstrainedBeamSearchScorer)	AssistantVocabTranslatorCacheAssistedCandidateGenerator-AssistedCandidateGeneratorDifferentTokenizersCandidateGeneratorEarlyExitCandidateGeneratorPromptLookupCandidateGenerator%UniversalSpeculativeDecodingGenerator_prepare_attention_mask_prepare_token_type_ids) NEED_SETUP_CACHE_CLASSES_MAPPINGQUANT_BACKEND_CLASSES_MAPPINGCompileConfigGenerationConfigGenerationMode)ContinuousMixin)#EncoderNoRepeatNGramLogitsProcessor'EncoderRepetitionPenaltyLogitsProcessorEpsilonLogitsWarperEtaLogitsWarperExponentialDecayLengthPenaltyForcedBOSTokenLogitsProcessorForcedEOSTokenLogitsProcessorHammingDiversityLogitsProcessorInfNanRemoveLogitsProcessorLogitNormalizationLogitsProcessorListMinLengthLogitsProcessor!MinNewTokensLengthLogitsProcessorMinPLogitsWarperNoBadWordsLogitsProcessorNoRepeatNGramLogitsProcessor PrefixConstrainedLogitsProcessor RepetitionPenaltyLogitsProcessorSequenceBiasLogitsProcessor$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperTypicalLogitsWarper.UnbatchedClassifierFreeGuidanceLogitsProcessor)ConfidenceCriteriaEosTokenCriteriaMaxLengthCriteriaMaxTimeCriteriaStoppingCriteriaStoppingCriteriaListStopStringCriteria)PreTrainedModel)PreTrainedTokenizerBase)BaseStreamer)AlignDevicesHookadd_hook_to_module)past_key_valuescache_paramsstatememspast_buckets_statesc                   .   e Zd ZU dZej
                  ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	e	ej                           ed<   dZee	e	ej                           ed<   dZee	e	e	ej                              ed<   y)	GenerateDecoderOnlyOutputaw  
    Outputs of decoder-only generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    	sequencesNscoreslogits
attentionshidden_statesra   )__name__
__module____qualname____doc__torch
LongTensor__annotations__ri   r   tupleFloatTensorrj   rk   rl   ra        p/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/generation/utils.pyrg   rg      s    4 15FHU5,,-.515FHU5,,-.5<@JuU%6%6789@?CM8E%(9(9":;<CHLOXeE%0A0A*B$CDELrw   rg   c                      e Zd ZU dZej
                  ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	e	ej                           ed<   dZee	e	ej                           ed	<   dZee	e	ej                           ed
<   dZee	e	e	ej                              ed<   y)GenerateEncoderDecoderOutputaI  
    Outputs of encoder-decoder generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    rh   Nri   rj   encoder_attentionsencoder_hidden_statesdecoder_attentionscross_attentionsdecoder_hidden_statesra   )rm   rn   ro   rp   rq   rr   rs   ri   r   rt   ru   rj   r{   r|   r}   r~   r   ra   rv   rw   rx   rz   rz      s   !F 15FHU5,,-.515FHU5,,-.5=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDKHLOXeE%0A0A*B$CDELrw   rz   c                   ~   e Zd ZU dZej
                  ed<   dZeej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeej
                     ed<   dZeeeej                           ed<   dZeeeej                           ed	<   dZeeeeej                              ed
<   y)GenerateBeamDecoderOnlyOutputa
  
    Outputs of decoder-only generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    rh   Nsequences_scoresri   rj   beam_indicesrk   rl   ra   )rm   rn   ro   rp   rq   rr   rs   r   r   ru   ri   rt   rj   r   rk   rl   ra   rv   rw   rx   r   r      s    @ 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3<@JuU%6%6789@?CM8E%(9(9":;<CHLOXeE%0A0A*B$CDELrw   r   c                      e Zd ZU dZej
                  ed<   dZeej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeej
                     ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeeej                           ed
<   dZeeeej                           ed<   dZeeeej                           ed<   dZeeeeej                              ed<   y) GenerateBeamEncoderDecoderOutputa  
    Outputs of encoder-decoder generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    rh   Nr   ri   rj   r   r{   r|   r}   r~   r   ra   )rm   rn   ro   rp   rq   rr   rs   r   r   ru   ri   rt   rj   r   r{   r|   r}   r~   r   ra   rv   rw   rx   r   r     s/   (T 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDKHLOXeE%0A0A*B$CDELrw   r   c            $          e Zd ZdZ	 	 ddeeeej                  f      dee	   de
fdZdej                  deej                     d	eej                     deej                  ej                  f   fd
Zdej                  deej                     d	eej                     deej                  ej                  f   fdZ	 	 	 	 ddej                  dee   deej                     deej                     d	eej                     f
dZ	 	 	 ddeej(                     deej(                     deeeej(                  f      deej(                  ee   eeej(                  f   f   fdZ	 	 	 ddeej(                     deej(                     deeeej(                  f      dej                  fdZdej(                  dedeeef   dej                  fdZdej(                  dee   dedeeef   fdZ	 ddededeeej(                  f   dej(                  deej:                     deej                  eeej(                  f   f   fdZe	 	 	 ddede	deej                     deej                  eeef   f   fd       Z 	 	 dd e!deeef   de	d!edeeef   f
d"Z"dedej                  dej(                  d#d$d%e#d&d'd(d'dede$fd)Z%	 	 	 	 	 	 	 	 dded*ee   d+ej                  d,ee
eej(                  ge&e   f      d%ee#   dee   deeeef      d-eej(                     d.eej(                     de#fd/Z'	 dded0ee(   d1ed'   de(fd2Z)d3ee#e(f   d4ee#e(f   dee#e(f   fd5Z*	 	 dd6ej(                  d7eej(                     d8eej(                     d9e	dej(                  f
d:Z+d; Z,deeef   fd<Z-d= Z.d> Z/	 ddee   d?ee	   d@edeeef   fdAZ0dB Z1deeeeeef   f      fdCZ2dDededEedej:                  def
dFZ3e4de	fdG       Z5deded#d$dedHedej:                  de	fdIZ6de	fdJZ7	 	 ddedKee	   deeej:                  ef      fdLZ8dedede	fdMZ9 ejt                         	 	 	 	 	 	 	 	 	 	 	 	 ddeej(                     dee   d%ee#   d0ee(   d,ee
eej(                  ge&e   f      dNee	   d#ed$   dOedP   d-eej(                     d.eej(                     d?ee	   dQee   dee;ej                  f   fdR       Z<dSe	dNe	dej:                  de	fdTZ=	 ddej                  d1ed'   dej                  fdUZ>dej                  dVeee&e   f   d%e#d0e(dedNe	dOdPdee?ej                  f   fdWZ@ ejt                         dej                  d%e#d0e(dedNe	dOedP   dee?ej                  f   fdX       ZAdej                  d%e#d0e(dedNe	dOedP   dee?ej                  f   fdYZBedZej(                  dej(                  fd[       ZCedZej(                  ded\edej(                  fd]       ZDedZej(                  d8ej(                  dej(                  fd^       ZEed_ej(                  d`ej(                  daej(                  dbej(                  dceddedeedfee	ef   dgeFfdh       ZGed_ej(                  dbej(                  diej(                  dfee	ef   fdj       ZHdkej(                  dlej(                  dmej(                  dcedeedne	doed\edpededeej(                  ej(                  ej(                  f   fdqZIdrej(                  dsej(                  dtej(                  diej(                  d\edeej(                  ej(                  ej(                  f   fduZJd6ej(                  dsej(                  daej(                  drej(                  d8ej(                  dtej(                  d_ej(                  dbej(                  diej(                  dvej(                  d\edcedeedgeFdfee	ef   deej(                  ej(                  ej(                  ej(                  f   f dwZKdej                  d%e#d0e(dedNe	deeLej                  f   fdxZMdej                  dyeNd%e#d0e(dedNe	fdzZOdej                  d{ePd%e#d0e(dedNe	deeLej                  f   fd|ZQdej                  d}e$d%e#d0e(dedNe	dOedP   dee?ej                  f   fd~ZRdej                  defdZSy)GenerationMixina	  
    A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes.
    Inheriting from this class causes the model to have special generation-related behavior, such as loading a
    `GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI.

    A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it
    has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which
    approximately shares the same interface to public methods like `generate`. Three examples:
        - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public
            methods in the mixin;
        - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as
           `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls
           `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should
           inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
        - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
            However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
            `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* if `num_beams=1` and `do_sample=False`
        - *contrastive search* if `penalty_alpha>0` and `top_k>1`
        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    Npretrained_model_name_or_pathtrust_remote_codereturnc                    t         j                  j                  |      }d}|r@t         j                  j                  t         j                  j                  |d            sd}nt	        |d      sd}|st        d| d      d| d}t        |||| |       t        |fd	d
i| t        |fddi|}t        d|      }|S )at  
        Loads and returns a custom generate function, given a model repo.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            trust_remote_code (`bool`, *optional*):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            **kwargs:
                Additional keyword arguments for remote code loading.

        Raises:
            OSError: If `pretrained_model_name_or_path` does not contain a `custom_generate` subdirectory.

        Returns:
            A callable that can be used to generate text.
        Tzcustom_generate/generate.pyF`zw` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.zThe repository `zS` contains custom generation code that will override the default `generate` method.)has_local_codehas_remote_codeerror_messagerequirements_filez custom_generate/requirements.txtmodule_filegenerate)
ospathexistsjoinr	   OSErrorr   r   r   r   )	selfr   r   kwargsis_local_codehas_custom_generate_folderr   modulecustom_generate_functions	            rx   load_custom_generatez$GenerationMixin.load_custom_generate  s   : 'DE%)"77>>"'',,/LNk"lm-2*<>[\-2*)12 3O O  <= >- - 	 	")( --'	
 	")	
=_	
ci	
 ()
7T
X^
 $7z6#J ''rw   	input_idsinputs_embedscache_positionc                 ^   t               r| j                  |||      S |/|j                  d   dk(  r|dd|j                  d    df   }||fS ||d   |j                  d   k\  r|dd|j                  d    df   }||fS |j                  d   |j                  d   k7  r	|dd|f   }||fS )a  
        Generic cache-dependent input preparation
        The code is put in a separate function to allow granular unit testing
        as it needs a different implementation to be exportable.

        If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        - Exception 1: when passing input_embeds, input_ids may be missing entries
        - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
        - Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
          generate the first token for each sequence. Later use the generated Input ids for continuation.

        The current implementation does not rely on ``self`` and could be
        a class method. It is left as a standard method to be easily rewritten.
        Nr&   r   )r$   ,_cache_dependant_input_preparation_exportingshape)r   r   r   r   s       rx   "_cache_dependant_input_preparationz2GenerationMixin._cache_dependant_input_preparation  s    * $%DDYP]_mnn$);q)@)!n.B.B1.E-E-G*GHM i'' %r"iooa&88!!n&:&:1&=%=%?"?@I i'' __Q>#7#7#::!!^"34Ii''rw   c                     ||dd|f   }||fS d d d t        j                  |j                  d   dk(  fdfd|||g      \  }}||fS )	z
        This method implements method ``_cache_dependant_input_preparation``
        with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
        The code is put in a separate function to allow granular unit testing.
        Nc                 P    | d d |j                   d    d f   j                         S Nr   r   clone)r   r   s     rx   branch_1zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_1  s,    $Q)=)=a)@(@(B%BCIIKKrw   c                 P    | d d |j                   d    d f   j                         S r   r   r   r   s     rx   branch_2zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_2  s,     ^%9%9!%<$<$>!>?EEGGrw   c                 0    | d d |f   j                         S Nr   r   s     rx   branch_3zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_3  s     N!2399;;rw   r&   r   c                 6     ||      | j                         fS r   r   )r   r   r   r   s      rx   <lambda>zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>  s     ?!)F rw   c                 h    |t        j                  |d   | j                  d   k\  fd| |g      fS )Nr   r&   c                 t    t        j                  | j                  d   |j                  d   k7  d | |g      S )Nr&   r   c                 "    | j                         S r   r   r   s     rx   r   zrGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>  s    yGX rw   rq   condr   )r   r   r   s     rx   r   z`GenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>  s>     %

$-OOA$6.:N:Nq:Q$Q$,%X%.$?	!" rw   r   )r   r   r   r   r   s      rx   r   zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>  sB    %

*2.)//!2DD$ '7F rw   r   )r   r   r   r   r   r   r   s       @@@rx   r   z<GenerationMixin._cache_dependant_input_preparation_exporting  s      !!^"34Ib i''KLH< (-zz"a'" M>:5($M98 i''rw   ra   attention_maskc           
      ^   i }||d<   |||d<   | j                  |||      \  }}| j                  j                  rdnd}| j                  j                  sQ|&t        |      |j                  d   k(  rd||<   ||d<   nL|j                  t        j                        ||<   d|d<   n#|j                  t        j                        ||<   | j                  j                  r|nd}	| j                  j                  r|j                  d	d      n|}| j                  j                  rd	nd
}
| j                  j                  rdnd}||j                  |      ~|t        t        j                  | j                        j                  j                               v r<|j!                         j#                  d      dz
  }|j%                  |dk(  d       |||<   dD ]~  }|j                  |      }||a|j                  d      |d   j                  d   n||   j                  d   }|dd| df   }|j                  t        j                        }|||<    t'        |t(              r[|j*                  rN|K|j,                  dk(  r;| j                  j                  s|d   |d   j                  \  }}}n||   j                  dd \  }}t/        | | j0                  |       }t3        |d      r|j5                         nd}t/        |dd      }||t/        |dd      }|p|j                  dd      }|j                  |d      }t/        | dt6              } || j                  t        j8                  ||f| j:                        |||||      }n2 ||||j=                         | j:                  ||| j                  |      }||||
<   |	|	|d
<   d| j                  j>                  v r| j@                  rt        jB                  | jD                  d}|d   dddf   }t        jF                  t        jH                  di ||j#                  d      jK                  d      gd      }tM        |jO                               dz   }|jQ                         \  }}|dk(  rt        jR                  |fi |n-|jU                  t        jB                        jK                  d      }t        jF                  t        jH                  di ||j#                  d      gd      }tM        |jO                               }|jW                  |jU                  | jD                        |jU                  | jD                        ||       |jY                         D ]  \  } }!| |vs|!|| <    |j                  dd       |S )a  
        Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
        slicing inputs given the existing cache.

        See the forward pass in the model documentation for expected arguments (different models might have different
        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
        r   Nra   decoder_input_idsr   r&   r   )memory_formatdecoder_attention_maskr   decoder_position_idsposition_idsr   r   )r   token_type_idsr   r   get_decoder5_prepare_4d_causal_attention_mask_with_cache_positionr   r   dtype)configinput_embedsr   r   ra   r   r   )sequence_lengthtarget_lengthr   r   
batch_sizer   ra   flashr   device)cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_klabels)r&   )-r   r   is_encoder_decoderlenr   r   rq   contiguous_formatpopgetsetinspect	signatureforward
parameterskeyslongcumsummasked_fill_
isinstancer   is_compileablendimgetattrbase_model_prefixhasattrr   r   emptyr   get_max_cache_shape_attn_implementation_supports_attention_backendint32r   catzerosaddintmaxsizeonestoupdateitems)"r   r   ra   r   r   r   r   model_inputsinput_ids_keyencoder_attention_maskattention_mask_keyposition_ids_keyr   model_input_namemodel_inputcurrent_input_lengthr   r   _
base_modeldecodercausal_mask_creation_functionr   
tensor_kwsposr   r   bsseq_lenq_lenr   r   keyvalues"                                     rx   prepare_inputs_for_generationz-GenerationMixin.prepare_inputs_for_generation&  s   $ )7%& &.=L*+'+'N'N=.($M9
 04{{/M/M+S^{{--(S-@MDWDWXYDZ-Z.2]+0=_- /8ooELcLco.d]+04_-*3//H_H_/*`L' 48;;3Q3QW[:>++:X:XFJJ/6^l 	 :>9W9W5]m59[[5S5S1Yg&

+,4 C(9(9$,,(G(R(R(W(W(Y$ZZ)..077;a?L%%n&91='3F#$ ![ 	= **%56K&". (++O<H %_5;;A>)-8>>qA )
 #.a2F1F1G.G"HK"-"3"3%BYBY"3"ZK1<-.	=  ...*##q(;;11l?6S6_1=o1N1T1T.
OQ.:=.I.O.OPRQR.S+
O !t'='=tDJ29*m2Tj,,.Z^G,3SUY-) -49L07TVZ1-
 -4!-!1!12BD!I+//0@$G07>Y[t0u-!>;;!&j/-JRVR\R\!]#1#1$3!-#1	" "?"$3"1"E"E"G**#1);;$3	" %/=L+,!--CL)*dkk6664;[;[#(;;$++FJ~.q"u5C!IIu{{'C
'CSZZPQ]EVEVWXEY&Z\]^Mswwy>A-L#..*KB4;qLEJJr0Z0cffU[[FYF]F]^_F`E!IIu{{'C
'CU\\RS_&UWXYMuyy{+L+..t{{;+..t{{;))	    !,,. 	*JC,&$)S!	*
 	4(rw   inputsbos_token_idmodel_kwargsc                 h   | j                   j                  rFt        | d      r:| j                  j                  | j                  k7  r| j                  j                  }n| j                  }|j                         D ci c]  \  }}|||k7  s|| }}}|j                  |d      }||t        d| d| d| d| d	      ||}|dk(  rd	|v r|d	   |j                  d	       n| j                   j                  sd	t        t        j                  | j                        j                  j                               v }|s#t        d
| j                  j                   d      | j!                  |||      |d<   |d	   d	}}n|t        d      |d	   d	}}| j!                  |||      }|||fS c c}}w )zT
        This function extracts the model-specific `inputs` for generation.
        encoderNz
`inputs`: z` were passed alongside z0 which is not allowed. Make sure to either pass z or z=...r   r   zAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)r  zMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)r   r   r   r  main_input_namer   r   
ValueErrorr   r   r   r
  r   r   	__class__rm   *_maybe_initialize_input_ids_for_generation)	r   r  r  r  
input_namekvinputs_kwarghas_inputs_embeds_forwardings	            rx   _prepare_model_inputsz%GenerationMixin._prepare_model_inputs  s    KK**i(,,0D0DD55J--J)5););)=bARSWaRa1bb $''
D9#(:VH$<ZL I,,284
|4I  %!F $L)HO,4  1[[33/>#%%d&H&HITTYY[C 0, 4$[\`\j\j\s\s[t ux x  -1,[,[L| -\ -[) &2/%BO
%$%tuu%1/%BO
 @@Wcdz<//[ cs   <F.F.c                 p   ||S |j                  d      }| j                  j                  rR|P|j                  j	                         dd }t        j                  |t
        j                  | j                        dz  S d}|j                         D ]-  }t        |t
        j                        s|j                  d   } n d|v r2t        j                  |dft
        j                  | j                        S |t        d	      t        j                  |dft
        j                  | j                        |z  S )
z3Initializes input ids for generation, if necessary.Nencoder_outputsr   r   ir&   r   r   zB`bos_token_id` has to be defined when no `input_ids` are provided.)r   r   r   last_hidden_stater   rq   r   r   r   valuesr   Tensorr   r  )r   r  r  r  r  r   r   r	  s           rx   r  z:GenerationMixin._maybe_initialize_input_ids_for_generation  s
    M&**+<=;;))o.I#55::<SbAE::e5::dkkJTQQ 
!((* 	E%."[[^
	
 l*::z1oUZZTTabbzz:q/DKKPS___rw   inputs_tensorgeneration_configc                 n   |j                   }|j                  }d|v r|d   j                  d   dkD  r|d   }t        j                  |j                  d d t        j
                  |j                        }||S t        |j                        dk(  xr, |j                  t        j                  t        j
                  fv }|s|S |d uxr t        ||      j                         }|d u xs t        ||      j                          }	||	z  }
|j                  |      j                         }||
z  ||
 z  z   }|S )Nr   r&   r   r   r   elementstest_elements)_pad_token_tensor_eos_token_tensorr   rq   r   r   r   r   r   r   r   anyne)r   r  r   r  pad_token_ideos_token_iddefault_attention_maskis_input_idsis_pad_token_in_inputs&is_pad_token_not_equal_to_eos_token_idcan_infer_attention_maskattention_mask_from_paddingr   s                rx   &_prepare_attention_mask_for_generationz6GenerationMixin._prepare_attention_mask_for_generation&  s_    )::(:: ,&<+D+J+J1+MPQ+Q(5M "'M,?,?,C5::^k^r^r!s))=../14g9L9LQVQZQZ\a\f\fPg9g))".d": "
}LQUUW 	 3?$2F 2
|<PTTVL
. $:<b#b &3&6&6|&D&I&I&K# (*BBE[_w^wEww 	 rw   r   c                    | j                         }t        | d      r4t        |d      rd|j                  _        nt	        |t        d             g d}|j                         D ci c]  \  }t        fd|D              s| }	}}t        t        j                  |j                        j                        }
d|
v xs d|
v }|s(|	j                         D ci c]  \  }}||
v s|| }	}}|j                  |	d	<   |j                  |	d
<   ||n| j                  }d|	d<   ||	|<    |di |	|d<   |S c c}}w c c}}w )Nhf_device_map_hf_hookT)io_same_device)decoder_
cross_attn	use_cachec              3   @   K   | ]  }j                  |        y wr   )
startswith).0parguments     rx   	<genexpr>zQGenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<genexpr>`  s     I!x**1-Is   r   r  output_attentionsoutput_hidden_statesreturn_dictr  rv   )get_encoderr   r4  r5  r`   r_   r   r'  r   r   r   r   r   r?  r@  r  )r   r  r  r   r   r  irrelevant_prefixr=  r	  encoder_kwargsencoder_signatureencoder_accepts_wildcards          `    rx   ._prepare_encoder_decoder_kwargs_for_generationz>GenerationMixin._prepare_encoder_decoder_kwargs_for_generationJ  sy    ""$ 4)w
+26  /"7,<D,QR D $0#5#5#7
 
%I7HII eO
 

   1 1'// B M MN#+/@#@#gNVgDg '7E7K7K7M$3HeQY]nQn%N  /@.Q.Q*+1B1W1W-. 0@/K+QUQeQe(,}%+8'(7>7P7P&')
s   *!D9D?,D?r   decoder_start_token_idr   c                    |d|v r|j                  d      }nd|v r|dk7  r|j                  d      }nd}|| j                  }|j                  dk(  rC|j                  d   |k7  rt	        d| d|j                  d          |j                  dd      }n+t        j                  |dft        j                  |	      |z  }||}||fS d
| j                  j                  j                         v sI| j                  j                  dk(  r5d
| j                  j                  j                  j                         v r	 ||fS | j                  j                  dv r	 ||fS |dddf   |dddf   k7  j                         j!                         r\t        j"                  ||gd      }d|v r?|d   }t        j"                  t        j$                  |      ddddf   |fd      }||d<   ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNr   r   r&   r   z1`decoder_start_token_id` expected to have length z	 but got r   r   donutzvision-encoder-decoder)whisperdimr   )r   r   r   r   r  viewrq   r   r   r  rm   lowerr   
model_typer  allitemr   	ones_like)r   r   r   r  rH  r   r   r   s           rx   )_prepare_decoder_input_ids_for_generationz9GenerationMixin._prepare_decoder_input_ids_for_generations  s(    #(;|(K , 0 01D EL(-=-L , 0 0 = $ >[[F!&&!+%++A.*< G
|S\]s]y]yz{]|\}~  &<%@%@Q%G" 

J?%**VLOee # $ 6, !,..% //5577KK""&>>7dkkNaNaNlNlNrNrNtCt !,.. [[##{2 !,..  1%)?1)EEJJLQQS %		+ACT*U[] ^'<7)56N)O&).__%;<QUCE[\*& :P56 ,..rw   expand_sizer   c                       dk(  r||fS  fd}||j                   d      } ||      }|r*|j                  d      t        d       ||d         |d<   ||fS )zIExpands tensors from [batch_size, ...] to [batch_size * expand_size, ...]r&   c                     | D ]E  }|dk7  s	| |   t        | |   t        j                        s-| |   j                  d      | |<   G | S )Nr   r   rL  )r   rq   r  repeat_interleave)dict_to_expandr  rU  s     rx   _expand_dict_for_generationzRGenerationMixin._expand_inputs_for_generation.<locals>._expand_dict_for_generation  sd    % d++&s+7">##6E*8*=*O*OP[ab*O*cN3'd "!rw   r   rL  r  zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)rX  r   r  )rU  r   r   r  rZ  s   `    rx   _expand_inputs_for_generationz-GenerationMixin._expand_inputs_for_generation  s     !l**	"  !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&rw   outputsnum_new_tokensc                    t         D ]   }||v s|dv rd}n|}t        ||      ||<    n d|v r7|d   }t        j                  ||d d df   j	                  d      gd      |d<   |sDd|v r|d   }t        j                  ||j                  |j                  d   df      gd      |d<   nCd	|v r?|d	   }	t        j                  |	|	j                  |	j                  d   df      gd      |d	<   |j                  d
d      r|d   dd  |z   |d<   |S |j                  d      }
t        j                  |
d   dz   |
d   |z   dz   |
j                        j                  |
j                        }t        j                  |
|f      |d<   |S )N)re   rd   ra   r   r   rL  r   r   r&   r   r8  Tr   r   )ALL_CACHE_NAMESr   rq   r   	unsqueezenew_onesr   r   r   aranger   r   r   )r   r\  r  r   r]  possible_cache_name
cache_namer   r   r   past_positionsnew_positionss               rx   #_update_model_kwargs_for_generationz3GenerationMixin._update_model_kwargs_for_generation  s    $3 	"g-&*II!2J!4J+27<O+PZ(	 |+)*:;N-2YYWXZ\W\H]HgHghjHk7lrt-uL)*!</!-.>!?16#^%<%<n>R>RST>UWX=Y%Z[ac2-.
 (<7)56N)O&9>+-C-L-LNdNjNjklNmopMq-rs:56
 K.-9:J-KBC-PSa-aL)*  *--.>?N!LLr"Q&r(:^(Ka(OWeWkWkb&&'  .3YY7V-WL)*rw   assistant_modelr\   logits_processortarget_tokenizerr]   assistant_tokenizerc	                    t        d |||fD              }	|j                  t        || ||||      }
|
S |j                  9t	        |j
                  |j                  |j                  |j                        }
|
S |	r|j                  du rct        j                  ||| j                  j                         j                  |d      }d|j                  _        t!        |||||||||	      }
|
S |j                  du rt#        ||||||||	      }
|
S t%        d
t'        |j                        j(                         t+        ||||||      }
|
S )zU
        Returns the candidate generator to be used in `assisted_generation`
        c              3   $   K   | ]  }|d u 
 y wr   rv   )r;  r  s     rx   r>  z;GenerationMixin._get_candidate_generator.<locals>.<genexpr>  s     "sQ1D="s   N)r   rh  r   r  r  ri  )r*  num_output_tokensmax_matching_ngram_size
max_lengthT)rh  assistant_prune_lm_head)	r   rh  r   r  r  ri  rj  rk  atm_translatorF)r   rh  r   r  r  ri  rj  rk  z7Invalid value for `do_sample`: expected a boolean, got )rQ  assistant_early_exitr0   prompt_lookup_num_tokensr1   r&  rp  rq  	do_sampler,   get_translatorr   get_text_config
vocab_sizer   repetition_penaltyr2   r.   r  typerm   r-   )r   r   r   r  rh  ri  rj  rk  r  different_tokenizerscandidate_generatorrs  s               rx   _get_candidate_generatorz(GenerationMixin._get_candidate_generator  s     #"s?L\^q:r"ss11="=# $"3)+!1#z #"k 77C"@.@@"3"L"L(9(Q(Q,77	#h #"] " **d2!>!M!M$'KK//1<<$3,0" HL11D&K'$3&7!-"/%5%5(;#1
'#F #"1 #,,5&S'$3&7!-"/%5%5(;	'#. #" !MdSdSnSnNoNxNxMyz  #=# /"3)+!1# #"rw   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnnegative_prompt_idsnegative_prompt_attention_maskc
           	      Z   t               }
|g }|j                  B|j                  dk7  r3|
j                  t        |j                  | ||	|j                               |j
                  %|
j                  t        |j
                               |j                  J|j                  dkD  r;|
j                  t        |j                  |j                  |j                               |j                  h|j                  dk7  rYt        |j                        dk(  r'|
j                  t        |j                  |	             nt        j                   d
t"               |j$                  4|j$                  dk7  r%|
j                  t'        |j$                               |j(                  3|j(                  dkD  r$|
j                  t+        |j(                               |j,                  g|j,                  dkD  rXt        |j                        dk(  r&|
j                  t/        |j,                  |             nt        j                   dt"               |j0                  /|
j                  t3        |j0                  |j4                               |j6                  Mt9        |dd      @|j6                  dkD  r1|
j                  t;        |j6                  |j4                  |             |j<                  Nt9        |dd      A|j<                  dkD  r2|
j                  t?        ||j<                  |j4                  |             |2|
j                  tA        ||j                  |j                  z               |jB                  $|
j                  tE        |jB                               |jF                  1|
j                  tI        |jJ                  |jF                  |             |jL                  du r|
j                  tO                      |jP                  0|
j                  tS        |jP                  |j4                  |             |jT                  &|
j                  tW        |jT                  |             |jX                  A|}|dkD  s|jB                  |n|dz   }|
j                  t[        |jX                  ||             | j]                  |
|      }
|j^                  rQ|j                  dkD  rwta        |j4                  tb              rt        |j4                        dz   }nFta        |j4                  td        jf                        r|j4                  j                  d   dz   }nd}nd}|jh                  3|jh                  dk7  r$|
j                  tk        |jh                               |jl                  5|jl                  dk7  r&|
j                  to        |jl                  |             |jp                  5|jp                  dk  r&|
j                  ts        |jp                  |             |jt                  &|
j                  tw        |jt                  |             |jx                  5|jx                  dk  r&|
j                  t{        |jx                  |             |j|                  >d|j|                  cxk  rdk  r)n n&|
j                  t        |j|                  |             |j                  ?d|j                  cxk  rdk  r*n n'|
j                  t        |j                  ||             |j                  M|
j                  |j                  j                  | j                  j                         j                  |             |j                  du r|
j                  t                      |
S )z
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
        instances used to modify the scores of the language model head.
        Nr&   )unconditional_idsunconditional_attention_maskr8  sequence_bias        )diversity_penalty	num_beamsnum_beam_groups      ?r   )penaltyr  zyPassing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.)r  r   z{Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.r&  r   T)top_kmin_tokens_to_keep)top_pr  )min_pr  )massr  )epsilonr  )r  r  r   )IrE   guidance_scaleappendrT   r8  r  rM   r  rB   r  r  encoder_repetition_penaltyr   r   r<   warningswarnUserWarningrz  rL   no_repeat_ngram_sizerJ   encoder_no_repeat_ngram_sizer;   bad_words_idsrI   r&  
min_lengthr   rF   min_new_tokensrG   rK   forced_bos_token_idr@   forced_eos_token_idrA   rq  remove_invalid_valuesrC    exponential_decay_length_penaltyr?   suppress_tokensrO   begin_suppress_tokensrN   _merge_criteria_processor_listrv  r   listrq   r  temperaturerP   r  rQ   r  rR   r  rH   	typical_prS   epsilon_cutoffr=   
eta_cutoffr>   watermarking_configconstruct_processorr   rx  ry  renormalize_logitsrD   )r   r   r  r  r  ri  r   r  r  r  
processorsbegin_indexr  s                rx   _get_logits_processorz%GenerationMixin._get_logits_processorR  s2   " )*
#!++7<M<\<\`a<a>%44&91O/99 **69HYHgHghi..:?P?b?beh?h/&7&I&I/99$5$E$E 88D!<<C$**+q0!!; 1 L L*; 9
 //;@Q@d@dhk@k>GXGkGklm11=BSBhBhklBl:;L;a;abc::F!>>B$**+q0!!7)FF) 9
 **6)%33%77 ((4)+>EQ!,,q0(%00%77! ,,8)+>EQ!00141(%44%77!	 $/0,%//3D3T3TT 00<-%99
 00<-%00%99! 22d:9;<==I-%FF%77( ,,8-%55! 22>.K )1,0A0U0U0]  1_ 
 4%;;! 88EUV
 && !**Q./AA4H),->-P-P)QTU)U& 1 C CU\\R):)L)L)R)RST)UXY)Y&)*&%&" !,,8=N=Z=Z^a=a!!"9:K:W:W"XY &&27H7N7NRS7S!!$+<+B+BWij !&&27H7N7NQT7T!!$+<+B+BWij !&&2!!$+<+B+BWij !**6;L;V;VY\;\!!'->-H-H]op !//;FWFfFf@lil@l!!' 1 @ @Ug
 !++7CBSB^B^<dad<d!!# 1 < <Qclr 00<!55IIKK//1<<f //47023rw   stopping_criteria	tokenizerc                    t               }|j                  =t        | j                  dd       }|j	                  t        |j                  |             |j                  %|j	                  t        |j                               |j                  3|t        d      |j	                  t        |j                  |             |j                  %|j	                  t        |j                               |j                  r@|j                  4|j                  dkD  r%|j	                  t        |j                               | j!                  ||      }|S )	Nmax_position_embeddings)rq  r  )max_timea  There are one or more stop strings, either in the arguments to `generate` or in the model's generation config, but we could not locate a tokenizer. When generating with stop strings, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.)stop_stringsr  )r*  r   )assistant_confidence_threshold)rZ   rq  r   r   r  rW   r  rX   r  r  r[   r&  rV   is_assistantr  rU   r  )r   r   r  r  r   criteriar  s          rx   _get_stopping_criteriaz&GenerationMixin._get_stopping_criteria7  s3    ()''3&-dkk;TVZ&[#OO!0;;,C %%1OOO5F5O5OPQ))5  s 
 OO.<M<Z<Zfopq..:OO,:K:]:]^_**!@@L!@@1DOO"BSBrBrs 66xARSrw   default_listcustom_listc                    t        |      dk(  r|S  t        |             }|D ]  }d}|D ]~  }t        |      t        |      u st        |t              rdnd}t        j                  d| dt        |       dt        |       dt        |       d		       |j                  |       d
} n |r|j                  |        |D ]  }||vs|j                  |        |S )a4  
        Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
        processor/criteria is present on both lists, use the user-defined one.

        (Note: up to v4.49.0, this function threw an exception is the same logit processor was found twice.)
        r   Fzstopping criteriazlogits processorz	A custom z	 of type zt has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom z5 will take precedence. Please check the docstring of z$ to see related `.generate()` flags.T)r   r{  r   rY   loggerwarning_oncer  )r   r  r  
final_listdefaultusing_customcustomobject_types           rx   r  z.GenerationMixin._merge_criteria_processor_list^  s    {q 'T,')
# 	+G L% <4=09CFL\9]"5cuK''#K=	$v, Heeijpeqdr sOOSTZ|n ]// %%f-#'L  !!'*	+" " 	*FZ'!!&)	* rw   rh   ri   r   normalize_logitsc                    |it        j                  |d   j                  d         j                  dd      j	                  |j
                        }|j                  dt        |            }t        j                  |      j                  t        |      d      j                  dd      }|r|j                  d| j                  j                         j                  |j                  d         }t         j                  j                  j!                  |d      }|j                  d|j                  d         }|dk  }d|j#                         z
  j%                  d      j'                         }|j)                         ddd|f   }|ddd|f   }d||<   || j                  j                         j                  z  }|j                  d   |z
  }|dd|df   |z   }	|j+                  d|	      }
d|
|<   |
S )a  
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`torch.LongTensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(torch.FloatTensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
                Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
                with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`torch.LongTensor`, *optional*):
                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="pt")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | log probability | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> # Tip 2: the output length does NOT include the input length
        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```Nr   r   r&   rL  )rq   rb  r   rN  r   r   expandr   stackreshape	transposer   rx  ry  r   r   log_softmaxr   sumr   r   gather)r   rh   ri   r   r  beam_indices_maskmax_beam_lengthbeam_sequence_indicescut_idxindicestransition_scoress              rx   compute_transition_scoresz)GenerationMixin.compute_transition_scores  s   h  <<q	(:;@@QGJJ9K[K[\L'..r3v;?L V$,,S["=GG1M ^^B(C(C(E(P(PRXR^R^_aRbcFXX((44V4CF^^BR(89F )1,05577<<R@DDF#))+A/?/?,?@-a1A/1A.AB +,&' !-t{{/J/J/L/W/W W //"%7AwxK(+@@ #MM!W5 01+,  rw   c                     y  j                   j                  rcj                   j                  sMg d}t        j                         D cg c]	  }||v s| }}t         fd|D              }|st	        d      d} j                   j                         j                  j                   j                         j                  k(  r|t	        d| d      y ||t	        d| d      y c c}w )N)encoder_attention_headsencoder_ffn_dimencoder_layersc              3   x   K   | ]1  }t        j                  |      t        j                  |      k(   3 y wr   )r   r   )r;  attrrh  r   s     rx   r>  z6GenerationMixin._validate_assistant.<locals>.<genexpr>  s3      X\T*go6L6Ld.SSs   7:zThe main model and the assistant don't have compatible encoder-dependent input shapes. Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper.zc(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)z`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` .z~The main and assistant models have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` )r   r   dirrQ  r  rx  ry  )r   rh  r  rk  attributes_to_checkr  	are_equaldoc_references   ``      rx   _validate_assistantz#GenerationMixin._validate_assistant  sB   ";;))/2H2H2[2["b478N8N4O"oDSW[nSn4"o"o `s I  J  r 	 ;;&&(337M7M7]7]7_7j7jj".  l  mz  l{  {|  }  /
  $7$?  U  Vc  Ud  de  f  %@' #ps   	C0C0c                    | j                   j                  rdD ]  }|j                  |d        g }t        t	        j
                  | j                        j                        }d|v sd|v r5|t        t	        j
                  | j                        j                        z  }| j                   j                  rt        | | j                  d      }t        | dd      }||t        |dd      }|7t        t	        j
                  |j                        j                        }||z  }t        | dd      }||t        |dd      }|Ht        t	        j
                  |j                        j                        }	||	D 
ch c]  }
d|
 	 c}
z  }|j                         D ]  \  }}|	||vs|j                  |         |rt        d| d	      yc c}
w )
zXValidates model kwargs for generation. Generate argument typos will also be caught here.)r   Nr   r  r  r  r6  z8The following `model_kwargs` are not used by the model: zG (note: typos in the generate arguments will also show up in this list))r   r   r   r   r   r   r
  r   r   r   r   r   r  r  )r   r  r  unused_model_args
model_argsr   r  encoder_model_argsr  decoder_model_argsxr	  s               rx   _validate_model_kwargsz&GenerationMixin._validate_model_kwargs  s    ;;)), ,  d+, **4+M+MNYYZ
 z!^z%A#g//=HHIIJ ;;)) t'='=tDJ dIt4G :#9!*i>"%():):7??)K)V)V%W"00
 dIt4G:#9!*i>"%():):7??)K)V)V%W"7IJ!!~JJ
&,,. 	.JC S
%:!((-	. JK\J] ^F F   Ks   <Gc           	         |rC|j                   7|j                  dk(  r(t        j                  d|j                   dt               ||j                  k\  r9| j
                  j                  rdnd}t        d| d| d	|j                   d
      d}|r|d|j                   dz  }|j                  Q|j                  |j                  kD  r8t        j                  d|j                   d|j                   d|z   t               |j                  [|j                  |z   }||j                  kD  r<t        j                  d|j                   d| d|j                   d|z   t               yyy)z=Performs validation related to the resulting generated lengthN   z0Using the model-agnostic default `max_length` (=zz) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.r   r   zInput length of z is z, but `max_length` is set to z}. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.z Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.z" Note that `max_length` is set to z, its default value.z-Unfeasible length constraints: `min_length` (z.) is larger than the maximum possible length (z).z1Unfeasible length constraints: `min_new_tokens` (z$), when added to the prompt length (z/), is larger than the maximum possible length ()
max_new_tokensrq  r  r  r  r   r   r  r  r  )r   r   input_ids_lengthhas_default_max_lengthinput_ids_stringmin_length_error_suffixr  s          rx   _validate_generated_lengthz*GenerationMixin._validate_generated_lengthL  s    "&7&F&F&NSdSoSosuSuMMBCTC_C_B` a  	 0;;;6:kk6T6T2Ze"#3"4D9I8J K%001 2UU + 	  "#45F5Q5Q4RRfg# ''38I8T8TWhWsWs8sMM?@Q@\@\?] ^11B1M1M0NbRTkl
 ++7*99<LLJ-888GHYHhHhGi j33C2D E55F5Q5Q4RRTVXop  	 9 8rw   c                    |j                   S|s<|j                  0t        j                  d|j                    d|j                   d       |j                   |z   |_        n|dk(  rK||j                  d   k7  r9| j
                  j                  s#|xj                  |j                  d   z  c_        nk|ri|j                  t               j                  k(  rH|j                  |z   |_        t        | j
                  dd      }|t        |j                  |      |_        |j                  H|s0t        j                  d|j                   d	|j                   d
       |j                  |z   |_        |S |dk(  rS||j                  d   k7  rA| j
                  j                  s+t        |j                  |j                  d   z
  d      |_        |S )z]Prepared max and min length in generation configs to avoid clashes between similar attributesNzBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   r&   r  zBoth `min_new_tokens` (=z) and `min_length`(=z) seem to have been set. `min_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   )r  rq  r  warningr   r   r   r8   r   minr  r  r   )r   r   r  has_default_min_lengthr   r  r  r  s           rx   _prepare_generated_lengthz)GenerationMixin._prepare_generated_lengthx  s    ++7).?.J.J.V./@/O/O.PPd(334 5ff ,=+K+KN^+^(
 / M$7$7$::KK22((M,?,?,BB(# ++/?/A/L/LL/@/K/KN^/^!,*1$++?XZ^*_'*6367H7S7SUl3m%0 ++7)./@/O/O.PPd(334 5ff ,=+K+KN^+^( !  / M$7$7$::KK22+./@/K/KmNaNabcNd/dfg+h(  rw   use_model_defaultsr   c                    d}|| j                   j                  r| j                   j                  t        | j                         k(  rut	        | j
                  j                               dkD  rOt        j                  | j
                        }|| j                   k7  r!t        j                  dt               || _         | j                   }d}t        j                  |      }|st        j                  t        j                  | j                   j                         j"                        }|du s||t        j                  d      k\  ri }t               }| j                   }	|	j$                  j'                         D ]T  \  }
}|
j)                  d      s|
dk(  rt+        ||
d      }t+        ||
d      }||k(  s=||k7  sC|||
<   t-        ||
|       V |j.                  d	k(  rd|_        |t	        |      dkD  rt2        j5                  d
| d       n|j6                  | j                   j6                  |_        |j8                  | j                   j8                  |_        |j:                  | j                   j:                  |_        |j<                  | j                   j<                  |_         |j>                  di |}||fS )z
        Prepares the base generation config, then applies any generation configuration options from kwargs. This
        function handles retrocompatibility with respect to configuration files.
        FNr   a?  You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed in v5. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )Tz4.50.0r   transformers_versionr  zX`generation_config` default values have been modified to match model-specific defaults: z=. If this is not desired, please set these values explicitly.rv   ) r   _from_model_config_original_object_hashhashr   r   &_get_non_default_generation_parametersr8   from_model_configr  r  r  copydeepcopyr
   parser  base_version__dict__r   r:  r   setattrr  rv  r  r  r  r*  r)  rH  r   )r   r   r  r   using_model_generation_confignew_generation_configmodel_base_versionmodified_values global_default_generation_configmodel_generation_configr  model_gen_config_valueglobal_default_valuecustom_gen_config_valuer  s                  rx   _prepare_generation_configz*GenerationMixin._prepare_generation_config  s    ).%$ &&99**@@DI_I_D``JJLMPQQ(8(J(J4;;(W%(D,B,BBMMB $ .CD* $ 6 6,0) !MM*;<, ")w}}T=S=S=h=h/i/v/v!w!T)"*/AW]]S[E\/\"$3C3E0*.*@*@'3J3S3S3Y3Y3[ 
P/C/~~c*c5K.K +23SUXZ^+_(.56Gd.S+/3GG26JJ/E, 138NO
P %00C727%/%-#o2F2J''r*++hj
 %119595K5K5X5X%2$119595K5K5X5X%2$119595K5K5X5X%2$;;C?C?U?U?l?l%< 0(//9&9 ,..rw   c                    d|v r|d   |S d|v rY| j                   j                  sCt        j                  |d   ddddf   t        j                        j                  d      dz
  }nd|v rY| j                   j                  rCt        j                  |d   ddddf   t        j                        j                  d      dz
  }n8t        j                  |t        j                  |      j                  d      dz
  }d}|j                  d	      ^|d	   }d}t        |t              s|d   d   j                  d
   }n,t        |d      r |j                         |j                         }||d }||d<   |S )zbCalculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past lengthr   Nr   r   r   r&   decoder_inputs_embedsr   ra   r   get_seq_length)r   r   rq   rS  int64r   r   r   r   r   r   r   r  )r   
seq_lengthr   r  r   past_lengthcaches          rx   _get_initial_cache_positionz+GenerationMixin._get_initial_cache_position	  ss    |+=M0N0Zl*4;;3Q3Q"__\/-J1aQR7-S[`[f[fgnnopqtuuN$49W9W-D EaAg NV[VaVabiijklopp  #ZZ
%++fU\\]^_bccN-.: !23EKeU+#Ahqk//2 01e6J6J6L6X#224+KL9N)7%&rw   c                    d}t        | d      rt        | j                  j                               dhk(  s(t        | j                  j                               ddhk(  rd}n0| j                  j                         D cg c]	  }|dvs| c}d   }| j                  j	                         D ci c]  \  }}||dv r|n| }}}|y| j
                  j                         j                  }t        |      dk(  r&d|v r"t        j                  t        |      |d         S i }t        | d	      rd}| j                         D ]  \  }}	|	| j                         u s|} n |t        d
      |D 
cg c]	  }
||
v s|
 }}
t        |      |k\  r-t        |      D ]  }|D ]  }
d| d|
 dv s||
   ||<       n~	 ||v r#t        j                  t        |      ||         }nVd|v r|j                  dd      d   }nt        d| d      Q|D ]'  }t        |      D ]  }d| d| dv s||   ||<    ' ) t        |      D ]  }||vst        d| d       |S c c}w c c}}w c c}
w )z
        Returns the device map for each decoder layer, to allocate the cache on the right device.
        Inspired from `dispatch_model` in accelerate.
        Nr3  cpudisk)r  r  r   r&    r   zw`model.get_decoder()` is not returning a named module of the model. This is unexpected, please open an issue on GitHub.r  zDecoder name z" not found in execution device mapzlayer z! has not been mapped to a device.)r   r   r3  r  r   r   rx  num_hidden_layersr   dictfromkeysrangenamed_modulesr   RuntimeErrorrsplit)r   execution_device_mapmain_devicednamer   r  layer_device_mapdecoder_namer   module_namedecoder_mapped_modulesidxlayers                 rx   $_get_layer_device_map_for_cache_initz4GenerationMixin._get_layer_device_map_for_cache_init%  s   
  $4)4%%,,./E7:c$BTBTB[B[B]>^chjpbq>q#*.*<*<*C*C*EbQRaIaqbcde %)$6$6$<$<$>$ D& V%>kFJ$  $  ' !KK779KK#$)b4H.H=='8!9;OPR;STT 4'L $ 2 2 4 fT--//#'L #"/  0D& +|WbGb&" &
 )*.?? !23 "C'= "se1:K=)::4H4U,S1!"" #';;+/==?P9QSghtSu+v(,'3':':3'B1'E*]<.Hj+kll  .  !23 C3%qzwa[00DU0K(- *+ 	TC**"VC50Q#RSS	T  C c$:&s   =	II.I!.	I'8I'cache_implementationmax_cache_lenc                    |dk(  rdt        | j                  dd      v rd}t        |   }| j                  j                  xs |j	                  d      du}t        | d      r$|r| j                  j                  n| j                  }|d	k(  r t        | j                  j                  |      }t        | d       xsG t        |       xs8 |j                  |k7  xs' t        |t        t        f      xs |j                  |k  }	|rFt        | d      r:|	xs6 | j                  j                  j                  |d   d
   j                   d   k7  }	|	rt        | j                  d      r| j                  j"                  }
n| j$                  }
| j'                         }| j                  j)                         |||
||d}|dv r|j+                  d| j,                  i        |di || _        |rI|j/                         }|d   d
   j                   d   |d<   t1        | j                   |di |      | _        | j                  S | j                  j3                          | j                  S )z
        Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a
        new `generate` call requires a larger cache or uses a different batch size.

        Returns the resulting cache object.
        hybridllama4rP  r  hybrid_chunkedr  N_cachesliding_windowr   r&   _pre_quantization_dtype)r   max_batch_sizer.  r   r   r&  )staticr0  offloaded_statictp_sizer.  rv   )r   r   r5   r   r   r   r3  self_attention_cacher  r4  r   r6  r   r   r.  cross_attention_cacher   r5  r   r,  rx  r   r9  r  r   reset)r   r-  r   r.  r   r  	cache_clsrequires_cross_attention_cachecache_to_checkneed_new_cachecache_dtyper&  cache_kwargsrD  s                 rx   
_get_cachezGenerationMixin._get_caches  s]     8+GDKKQ]_a<b0b#3 ;<PQ	KK**]l.>.>?P.QY].] 	' 4"A_T[[==eiepepN#33 : :MJM h'' <~y99<,,
:< !35I J< ++m; 	 *gdH.E r;;44BBlSdFefgFhFnFnopFqq 
 t{{$=>"kkAA"jj#HHJ++557",!.$ $4L $'OO##Y$=>#3l3DK-!-!2!2!42>?P2QRS2T2Z2Z[\2]/1$++y?Z>?Z[ {{ KK{{rw   c                 J      j                    xr t         fddD              S )a  
        Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
        This adds exception for some models like `Mamba` models which use their own caches
        and do not need to initialize the Cache in advance in order to save memory (because no back and forth
        `to_legacy_cache` and `from_legacy_cache` will be performed for mamba-based models).
        c              3   V   K   | ]   }|j                   j                         v " y wr   )rm   rO  )r;  special_model_nameclss     rx   r>  zBGenerationMixin._supports_default_dynamic_cache.<locals>.<genexpr>  s+      ,
" cll&8&8&::,
s   &))reformerminimaxxlnetlfm2)_is_statefulrQ  )rG  s   `rx   _supports_default_dynamic_cachez/GenerationMixin._supports_default_dynamic_cache  s1     ### 
 ,
',
 )
 	
rw   max_cache_lengthc                 T    t         fddD              }|sdnd} j                  j                  xs |j                  d      du}	|j                  |      }
|
k|j                  t        d| d      t        |
t              r? j                         r/|	st        j                  |
      nt        j                  |
      ||<   y|j                  d	u ry j                         s5|j                  (t        j                  d
|j                   dt               y|6|j                  *t         j#                  d|j                   d       d|_        |j                  xs' t%         j                  j'                  d      dd      |_        |j                  f|j                  t(        v rk|j                  dk(  r j*                  st        d       j-                  |j                  t/        |j0                  |j2                        |z  |||      ||<   y|j                  dk(  r j                  j                  s j                         st        d      |j4                  |j4                  nddi}t6        |d      }|d   dk(  rt9               st;        d      |d   dk(  rt=               st;        d       |di |||<   y|j                  dk(  rt?               ||<   y|j                  dk(  rt               ||<   yy|	s
t               nt        t               t                     ||<   y)z
        Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
        instantiated, writes it to `model_kwargs`, under the name expected by the model.
        c              3   j   K   | ]*  }|j                   j                  j                         v  , y wr   )r  rm   rO  )r;  
class_namer   s     rx   r>  z@GenerationMixin._prepare_cache_for_generation.<locals>.<genexpr>  s*     tPZjDNN,C,C,I,I,KKts   03)mambafalconh1ra   rb   r  NzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.FzThis model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'.T)r  r-  r7  zThis model does not support `cache_implementation='static'`. Please check the following issue: https://github.com/huggingface/transformers/issues/28981)r-  r   r.  r   r  	quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.backendquantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via  with `pip install optimum-quanto`HQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via  with `pip install hqq`	offloadeddynamicrv   ) r'  r   r   r   r-  r  r   rt   rM  r   from_legacy_cacher   r8  r  r  r  r  r  r   rx  r5   _can_compile_fullgraphrC  r   r  num_return_sequencescache_configr6   r#   ImportErrorr"   r   )r   r   r  rh  r   rN  r   is_hybrid_cacherd  r>  user_defined_cacher]  cache_classs   `            rx   _prepare_cache_for_generationz-GenerationMixin._prepare_cache_for_generation  s~    t^stt.=&>
 KK**]l.>.>?P.QY].] 	' *--j9) 55A cdnco pT T  ,e49]9]9_ : !223EF,>>?QR Z(
  &&%/ 335 55ABBSBhBhAi j  	  &+<+Q+Q+]%::;2? 6:21B1W1W 2
[bKK'''57Mt\
. 11= 559YY$99XEdNiNi$Z  ,0??):)O)O"#4#>#>@Q@f@fgjtt"2!!- ,; ,Z( #77;F;;119]9]9_$M  )55A &22#X. 
 <L<ST	*h6?Z?\%S  "),5>N>P%H 
 ,7+F+FZ("77;F+9+;Z("779D+7>Z( E 6 (H $rw   c                     dt        t        j                  | j                        j                  j                               v S )z
        Return True if the current model supports the keyword argument `logits_to_keep` in forward()
        to save memory. Checking it in this way allows to avoid using a new model attribute.
        logits_to_keep)r   r   r   r   r   r   )r   s    rx   _supports_logits_to_keepz(GenerationMixin._supports_logits_to_keepC  s2    
  3w'8'8'F'Q'Q'V'V'X#YYYrw   kwargs_has_attention_maskc                     d fd	} ||j                   |      } ||j                  |      } ||j                  |      } ||j                  |      } j                  j
                  r||n|}| |j                  dk(  r|j                  d      }|9|7||st        j                  d       |d   }t        j                  d| d        j                  j
                  r|t        d      |4t        ||	      j                         r||st        j                  d
       |At        j                  |      s|dk  j                         rt        j                  d| d       ||_        ||_        ||_        ||_        y)a  
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensor.

        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
        Nc                     | | S ||nj                   }t        | t        j                        r| j	                  |      S t        j
                  | |t        j                        S )Nr   r   )r   r   rq   r  r   tensorr   )tokenr   r   s     rx   _tensor_or_nonez@GenerationMixin._prepare_special_tokens.<locals>._tensor_or_noneZ  sQ    }%1Vt{{F%.xx''<<fEJJGGrw   r  r   zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.r"  zThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r   )r  r*  r)  rH  r   r   r   r`  r  r  r  r   r'  r  rq   is_floating_point_bos_token_tensorr&  r%  _decoder_start_token_tensor)	r   r   rf  r   rl  bos_token_tensoreos_token_tensorpad_token_tensordecoder_start_token_tensors	   `        rx   _prepare_special_tokensz'GenerationMixin._prepare_special_tokensJ  s    	H ++<+I+IRXY*+<+I+IRXY*+<+I+IRXY%45F5]5]fl%m" ;;)).H.T*Zj '
 ',<,A,AQ,F/99!< #(8(D(4=Vq  02NNFGWFXXqrs ;;)).H.Pn  (!+;K[\``b(4=V##C
 '##$45:JQ:N9S9S9UNNMN^M_ `r r /?+.>+.>+8R5rw   c                 t   |j                   ry| j                  j                  dk(  xs/ t        |j                  duxr |j                  j
                        }t        |j                  d      t              xr |d   j                  }|xr |xr | j                  }t        | dd      || j                  j                  z  }t        | d      rGt        | j                  j!                               }d|v xr t#        |      dkD  }|| z  }d	|v }|| z  }|j                  |st$        j'                  d
       |S )zp
        Determines whether to trigger auto-compilation of the model's forward pass at generation time.
        FcudaNra   hf_quantizerr3  r  r&   r  zsYou have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.)disable_compiler   r{  boolcompile_config_compile_all_devicesr   r   r   r   r[  r   rw  r   r   r3  r  r   r  r  )	r   r  r   valid_hardwareusing_compilable_cachecan_compileall_model_deviceshas_cpu_offloadhas_disk_offloads	            rx   _valid_auto_compile_criteriaz,GenerationMixin._valid_auto_compile_criteria  sS   
 ,, ))V3 
t,,D8r=N=]=]=r=r8
 |''(9:EBu|TeGfGuGu 	 %_)?_DD_D_ 4.:4,,;;;K4) #D$6$6$=$=$? @#'88WSAR=SVW=WO..K  &)::///K ++7#
 rw   synced_gpusstreamerr^   custom_generatec                    |j                  dd      }|dh d}t               j                         D ci c]  \  }}||vs|| }}}|j                  |        | j                  |fd|i|} |d=d| i|S |j                  dd      }|j                  dd      } | j
                  |fi |\  }| j                  |j                                | j                  |||       |0t               xs t        |       xr t        j                         dkD  }||n	t               }||n	t               }dt        t!        j"                  | j$                        j&                  j)                               v }d	|v}|j+                  dd      du}| j-                  |j.                  |      \  }}}|j0                  d
   }|j2                  }| j5                  ||       | j6                  j8                  sjj:                  ^|dkD  rYt=        |j0                        dk(  rAt?        j@                  |dddf   j:                  k(        d
kD  rtB        jE                  d       | j6                  j8                  s|dk(  rd_#        |s|r|r| jI                  ||      |d<   n-|r+|dk(  r&t=        |d   j0                        dkD  rtK        d      | j6                  j8                  rd	|vr| jM                  |||      }| j6                  j8                  r.| jO                  |||jP                  |j2                        \  }}n|dk(  r|n|j                  d      }jR                  r| jU                  ||      }||jW                  |jY                                |j0                  d   }|j+                  d      du xr jZ                  du} |j+                  d      du xr j\                  du}!| j_                  | |!|||      | ja                         r	d|vrd|d<   | jc                  ||        jZ                  dz
  }"|j0                  d   |k7  r-|dk(  r(| j6                  j8                  s|"|j0                  d   z  }"| je                  ||||"|       jg                  |      }#|jh                  dkD  rtK        d      | j2                  jj                  |j2                  jj                  k7  r`tm        jn                  d|j2                  jj                   d| j2                  jj                   d| j2                  jj                   dtp               | js                  |||||j2                  ||	|
	      }$ | jt                  d=||d|}%jF                  |d<   |#tv        jx                  k(  rĉjz                  dkD  rtK        d jz                   d!      |dkD  rtK        d"      |d   stK        d#      j|                  d$v rtK        d%      | j~                  r"tK        d&| j                  j                         | j                  |||||||'      }& | j                  |f|&|$|%||d(|}'n|#tv        j                  k(  rk|stB        j                  d)       | j~                  r"tK        d*| j                  j                          | j                  |fj                  |$|%||d+|}'nK|#tv        j                  k(  rp|stB        j                  d,       |d   stK        d-      | j~                  r"tK        d.| j                  j                          | j                  |f|$|%||d/|}'n|#tv        j                  tv        j                  fv rR | j                  d=|jz                  | j6                  j8                  d0|\  }} | j                  |f|$|%||d/|}'nT|#tv        j                  tv        j                  fv rQ | j                  d=|jh                  | j6                  j8                  d0|\  }} | j                  |f|$|%|d1|}'n|#tv        j                  k(  rtB        j                  d2       t        |jh                  |j2                  j                  j                  jz                  j                  jZ                  3      }( | j                  d=|jh                  | j6                  j8                  d0|\  }} | j                  ||(f|$|%|d1|}'n|#tv        j                  k(  rtB        j                  d4       g })j                  j                  })j                  fd5}*t        j                  t              rt=        j                        d
k(  r |*        j                  D ]  }+t        |+d
   t              rct        |+t              rt=        |+      d
k(  r |*        t        d6 |+D              r |*        t        d7 |+D              r |*        t        |+      },nIt        |+t              rt=        |+      d
k(  r |*        t        d8 |+D              r |*        t        |+      },|)j                  |,        t        |)|jh                  |j2                  j                  j                  jz                  jZ                  9      }- | j                  d=|jh                  | j6                  j8                  d0|\  }} | j                  |f|-|$|%|d:|}'j                  du rAt        'd;      r5t        |'j                  d<      |'j                  j                         |'_e        'S c c}}w )>a  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config ([`~generation.GenerationConfig`], *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complements the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
                sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
                intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://huggingface.co/papers/2010.00904).
            synced_gpus (`bool`, *optional*):
                Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
                to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
                deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
                size. This is an experimental feature, subject to breaking API changes in future versions.
            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Attention_mask for `negative_prompt_ids`.
            use_model_defaults (`bool`, *optional*):
                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
                generation configuration (`model.generation_config`), as opposed to the global defaults
                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
                `True`.
            custom_generate (`str`, *optional*):
                A string containing the name of a huggingface.co repository. If provided, the custom `generate`
                function defined in that reposity's `custom_generate/generate.py` file will be executed instead of the
                standard `generate` method. Note that the logic is for generation is entirely defined in that
                repository, and the return type may be different from the standard `generate` method.
            kwargs (`dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateDecoderOnlyOutput`],
                    - [`~generation.GenerateBeamDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        r   N>   r   r   r  r   global_keys_to_excludemodelr  rk  r&   r   r  r   r  r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.r   Tr   z1`attention_mask` passed to `generate` must be 2D.)r   r   r  rH  r   rq  r  )r   r  r  r   r  r  rd  zZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)	r   r  r  r  ri  r   r  r  r  )r   r  r  r8  zFnum_return_sequences has to be 1 when doing assisted generate, but is r  z6assisted generate is only supported for batch_size = 1z+assisted generate requires `use_cache=True`)r7  r0  r4  z=assisted generate is not supported with Static cache classes`zCassisted generation is not supported with stateful models, such as )r   r   r  rh  ri  rj  rk  r  )r}  ri  r  r   r  r  zDoLa Decoding is scheduled to be moved to a `custom_generate` repository in v4.55.0. To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call.z=dola decoding is not supported with stateful models, such as )dola_layersri  r  r   r  r  zContrastive Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call.z,Contrastive search requires `use_cache=True`zBcontrastive search is not supported with stateful models, such as )ri  r  r   r  r  r   rU  r   )ri  r  r   r  zGroup Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call.)r   r  r   length_penaltydo_early_stoppingnum_beam_hyps_to_keepr  rq  zConstrained Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call.c                  6    t        d j                   d      )Nzo`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]` of positive integers, but is r  )r  force_words_idsr   s   rx   	typeerrorz+GenerationMixin.generate.<locals>.typeerror
  s*    $88I8Y8Y7ZZ[] rw   c              3   >   K   | ]  }t        |t                 y wr   )r   r  r;  	token_idss     rx   r>  z+GenerationMixin.generate.<locals>.<genexpr>
  s     Y9:i#>>Ys   c              3   @   K   | ]  }t        d  |D                yw)c              3   L   K   | ]  }t        |t               xs |d k    ywr   Nr   r   r;  token_ids     rx   r>  z5GenerationMixin.generate.<locals>.<genexpr>.<genexpr>
  s'     jT\Z#%>!>!N(Q,!Nj   "$N)r'  r  s     rx   r>  z+GenerationMixin.generate.<locals>.<genexpr>
  s$       )  j`ijjs   c              3   L   K   | ]  }t        |t               xs |d k    ywr  r  r  s     rx   r>  z+GenerationMixin.generate.<locals>.<genexpr>
  s'     hS[Jx$= = MA Mhr  )constraintsr   r  r   r  r  r  rq  )constrained_beam_scorerri  r  r   r  ra   to_legacy_cacherv   )gr   localsr   r   r   r  r  r  r  r   r   distget_world_sizerE   rZ   r   r   r   r   r   r   r   r  r  r   r   rt  r   r   r%  r   rq   r  r  r  r8  r1  r  rG  rT  ro  token_healingheal_tokensputr  rq  r  r  re  r  rb  get_generation_moder  r{  r  r  r  r  r  r9   ASSISTED_GENERATIONr\  r-  rL  r  rm   r~  _assisted_decodingDOLA_GENERATIONr  _dola_decodingr  CONTRASTIVE_SEARCH_contrastive_searchSAMPLEGREEDY_SEARCHr[  _sampleBEAM_SAMPLEBEAM_SEARCH_beam_searchGROUP_BEAM_SEARCHr*   r  early_stoppingr  _group_beam_searchCONSTRAINED_BEAM_SEARCHr  r  r   r  r'  r'   r(   r  r+   _constrained_beam_searchreturn_legacy_cacher   r   ra   r  ).r   r  r   ri  r  r  r  rh  r  r  r  r  r  r   r   r  r  r	  generate_argumentsr   r  rk  r  accepts_attention_maskrequires_attention_maskrf  r  r   r   r   r   r  r  r  rN  generation_modeprepared_logits_processorprepared_stopping_criteriar}  resultbeam_scorerfinal_constraintsr  word_ids
constraintr  s.     `                                           rx   r   zGenerationMixin.generate  s   ^ #JJ':DA&&" @Fx~~?O!ueSV^tSt#u*!u!u%%f-'@t'@'@(3D(HN($ ,M$M:LMM JJ{D1	$jj)>E*I$*I*I1+
5;+
'< 	##L$5$5$78  )=PQ 57W;QRV;Wv]a]p]p]ruv]vK/?/K+QdQf1B1N-ThTj!1S9J9J4<<9X9c9c9h9h9j5k!k"3<"G$0$4$45Et$LTX$X! 9=8R8R%22L9
5' #((+
%%$$%68QZ`$a {{-- "33?N++,1IImArE26G6Y6YYZ]^^l {{--2Bo2U*.'(-DI_-1-X-X0,.L)* ';.3|DT7U7[7[3\_`3` !TUU;;)).?|.SNN|-=?PL
 ;;))&*&T&T%!1)'8'T'T$++ 'U '#I| *:[)HlN^N^_jNkI**((I>ILL) %??1-!'L!9T!A!nFWFbFbjnFn!'L!9T!A!nFWFbFbjnFn ::/#9#9-'- ; 
 ((*/?|/S-.L)*''(9;KMcd -77!;"&66 O3KK22 3 3A 66**|_jJZ\b	

 ,??P%6%@%@1%Dl  ;;y//444MM@@I@P@P@U@U?V W++**+ ,TTXT_T_TdTdSe f*	*  %)$>$>/!1+%=- ''% 3+I %? 
%
! &AT%@%@ &
/CT`i&
ms&
"
 %6$?$?[! n@@@ 559 /DDEQH  A~ !YZZ, !NOO 559__ !`aa   !YZ^ZhZhZqZqYrs 
 #'"?"?"3#+ /!1!*$7) #@ 	# -T,,	$7!:"<"3'!	 	F  > >>$##w    STXTbTbTkTkSlm  )T((	-99!:"<"3'!	 	F  A AA$##w  , !OPP   XY]YgYgYpYpXqr  .T--!:"<"3'! F !6!68T8T UU&Hd&H&H '#-BB#';;#A#A' 	'#I| "T\\!:"<"3'! F !;!;^=W=W XX&Hd&H&H '#-77#';;#A#A' 	'#I| 'T&&!:"<"3' F  @ @@s
 +%+55$++0??"3"B"B&7&L&L 1 A A,77	K 'Id&H&H '#-77#';;#A#A' 	'#I| -T,, ";"<"3' F  F FFs !# ,,8$5$A$A! 00< ##4#D#DdK,<<=BK 1 A A 9H!(1+t4)(D9S]a=O%KYPXYY%K -5  &K%:8%D
)(D9S]a=O%Kh_ghh%K%6x%@
%,,Z8)9. 'B-%+55$++0??"3"B"B&7&L&L,77	'# 'Id&H&H '#-77#';;#A#A' 	'#I| 3T22(?!:"<"3' F 11T9 12..0ABN%+%;%;%K%K%MF"q "vs   nnthis_peer_finishedc                     |r_t        j                  |rdnd|      }t        j                  |t        j                  j
                         |j                         dk(  ryy|ryy)z
        Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is
        fed through `this_peer_finished`. ZeRO stage 3-friendly.
        r  r  r  )opFT)rq   rj  r  
all_reduceReduceOpSUMrR  )r   r  r  r   this_peer_finished_flags        rx   _has_unfinished_sequencesz)GenerationMixin._has_unfinished_sequences
  s^    
  ',ll:L3RU^d&e#OO38I8IJ&++-4   rw   c                 `   t        d      j                  j                  }}t        j	                               }t        d|      }j                  |d      D cg c]  }|j                          }} |dd      j                  j                  |j                        }t        j                  ||k(  ||      }	 |j                         d	k(  r|S |ddd
f   j                         }	j                  j!                  d            d	   fd|	D        }
t#        t%        |	|
            D ]  \  }\  }}||   }t        j&                  ||k(        j)                         r5	 |j+                  |      D ci c]  }j!                  |      fd }}t-        |      dk(  ru||fxx   dz  cc<   |j/                  |       |dd
 }	 |j                         d	k(  rt-        |||k7           dk(  r||d
<   | j1                  |j3                  d	      |      ||<    |S c c}w c c}w )a  
        Generates sequences of token ids for models with a language modeling head.
        Parameters:
            input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
            tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
        Return:
            `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
        Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r&   )r  r)  T)skip_special_tokenspt)return_tensorspaddingr   r    c              3   `   K   | ]%  }j                  |      j                  d        ' yw)r  N)decodereplace)r;  t	space_tokr  s     rx   r>  z.GenerationMixin.heal_tokens.<locals>.<genexpr>  s)     SQY%%a(00i@Ss   +.)prefixg      $@r  r  r  )r  r  r)  r   	get_vocabr8   batch_decodestripr   r   r   rq   wherenumeltolistconvert_ids_to_tokensconvert_tokens_to_ids	enumerateziprQ  rR  
extensionsr   r   r   r`  )r   r   r  r  r)  
vocab_trier   r<  promptstail_ids	tail_toks	batch_idxtail_idtail_tok	batch_idsalt_tokseq_biastrimmed_idsr  s     `               @rx   r  zGenerationMixin.heal_tokens
  sr    * 
 &/%;%;Y=S=Sl#I$7$7$9:
,ALY '0&<&<Y\`&<&ab1779bb
 )BBy''(	 	 KK	\ 9<S		 ??!QU#**,33I4S4STW4XYZ[\	 T(S	.7Hi8P.Q "	p*I*!),Iyyl2388:
 R\QfQfnvQfQwFM009;TAH  8}! gZ C' $$8$<#CR.K   "a' 9Y,678A=".B#'==1F1Fq1I]n=#oIi E"	pH y cDs   #H&?H+r  c           
      
   | j                   j                  rt        d      |j                  }	|j                  }
|j
                  }|j                  }|j                  }|j                  }t        d |D              }|j                  }|r|rdnd}|r|rdnd}|r|
rdnd}|r|
rdnd}|r|rdnd}|j                  dd \  }}t        j                  |t        j                  |j                        }| j!                  ||j                  |      }d}| j                   j#                         j$                  }| j                   j&                  sd}n|dkD  rd}n
|dk(  rd	}nd}t)        |t*              rF|d
k(  rA||dz  k(  r|g}n|dk  rt-        t/        ||dz  d            nt-        t/        |dd            }n~t)        |t*              r=|dk(  r8|dk  rt-        t/        |dz  |d            nt-        t/        |dz
  |d            }n1t)        |t,              r|D cg c]
  }||k  s	| }}nt        d      | j1                         }|t        d      | j3                  |||j                        r | j4                  |fi |} | di |d|
dd} | j6                  dddddf   j9                         j;                  dt        j<                        }!| j6                  dddddf   j?                         }"i }#|D ]=  }$ || j@                  |$   dddddf         j;                  |"j                        |#|$<   ? | jC                  | || j                   j                        }|r|rtE        ||#|"      }%|%j;                  |j                        }% |||%      }&|r|r||&fz  }|r||!fz  }|
rY|| j                   j                  r| jF                  fn| jH                  fz  }| j                   j                  r|| jJ                  fz  }|r3|| j                   j                  r| jL                  fn| j@                  fz  }|rHtN        jP                  jS                  |&d      }'t        jT                  |'d	      jW                  d	      }(nt        jX                  |&d      }(|r|(|z  |	d	|z
  z  z   }(t        jZ                  ||(dddf   gd      }||j]                  |(j_                                | |||       z  }|ja                         dk(  }| j3                  |||j                        r||jc                          |r te        ||||||jg                  d            S |S c c}w )a  
        Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be
        used for decoder-only text models.
        The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
        Models" (https://huggingface.co/papers/2309.03883) in ICLR 2024.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            dola_layers (`Union[str, list[int]]`):
                The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
                means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
                to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        z8DoLa decoding is only available for decoder-only models.c              3   4   K   | ]  }t        |d         ywr*  Nr   r;  r  s     rx   r>  z1GenerationMixin._dola_decoding.<locals>.<genexpr>~       'lh.(I'l   rv   Nr   r   Fr   r&   low(   r  highz?dola_layers must be either 'low', 'high' or a list of integers.zCDoLa is not supported for models that don't have output embeddings.r  T)rA  r?  r@  r   )r  r   r   rL  num_samplesra   rh   ri   rj   rk   rl   ra   )4r   r   r  r%  r?  r@  output_scoresoutput_logitsreturn_dict_in_generater'  rv  r   rq   r   r   r   r  rx  r  tie_word_embeddingsr   strr  r  get_output_embeddingsr  r
  rj   detachr   float32floatrl   rg  _dola_select_contrastr}   rk   r~   r   r   r   softmaxmultinomialsqueezeargmaxr   r  r  r   endrg   r   ))r   r   r  ri  r  r   r  r  r  r)  r?  r@  r  r  r  has_eos_stopping_criteriarv  ri   
raw_logitsr}   r~   r   r   
cur_lengthunfinished_sequencesr  final_layerstart_layercandidate_premature_layersilm_headr   r\  final_layer_next_token_logitsfinal_logitscandidate_premature_logitscandidate_premature_layernext_token_logitsnext_token_scoresprobsnext_tokenss)                                            rx   r  zGenerationMixin._dola_decodingB  s   d ;;))WXX )::-??0EE)77)77"3"K"K$''lZk'l$l!%//	 0M3RD
$;@QRX\"9>O2VZ'>CW^b "+!!4
J$zz*EJJyO_O_`77
IDTDTVbc" kk113EE
 {{..K1_KAKK k3'K5,@kQ...9]* #b( {K1,<a@AeKQ78 +
 S)kV.C "$ U;!+[!<=%b 0+qAB ' T*5@)TAO!)T&)T^__,,.?bcc,,-?U^UeUe,f=4==iX<XL   "3%)	G -4NN1b!8,D,K,K,M,P,PVZbgbobo,P,p)">>!R(399;L)+&-G *)HO))*CDQAXNI"\(() ++DE*  CC#';;#A#A D L
 1 5*,F! !2 4 4Y5E5E F 0<M N ' 022F #@"BBJ$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) --.?R-H#//1EMMaP#ll+<"E )),@@<STWkSkCll 		9k!T'.B"CLI#[__./ $8;LYX^;_:_#_ !5!9!9!;q!@U ,,-?U^UeUe,fX LLN",#!-3 , 0 01B C  C *Us   
U%"U%c                 P   t        d |D              }|j                  }	|j                  }
|j                  }|j                  }|j
                  }|j                  }|j                  }|j                  }|j                  }|r|rdnd}|r|rdnd}|r|rdnd}|r|rdnd}|r|rdnd}|rF| j                  j                  r0|r|d   j                  d      nd}|r|d   j                  d      nd}|j                  dd \  }}t        j                  |t        j                   |j"                        }| j%                  ||j"                  |      }t        j&                  |t        j                   	      }| j                  j                  rd
|v r|d
   |d
   }n|d   }|j)                  |	d      }d}| j+                  |||j"                        	r|j                  d      1t-        |d   t.        t0        f      r|d   j3                         dk(  rd|d<    | j4                  |fi |} | di |dd|d}| j                  j                  r|j6                  d   } n|j8                  d   } |j:                  dddddf   j=                  dt        j>                  |j"                        }!| jA                  ||| j                  j                        }|s- | jB                  d||	| j                  j                  d|\  }"}|j                  d      }#|#"tE        | jF                  jH                   d      t-        |#d   tJ        t        jL                  f      r|#d   d   j                  d   |k7  r"tE        | jF                  jH                   d       ||!      }$tN        jP                  jS                  |$d      }%t        jT                  |%d|	      \  }&}'|r|r||!fz  }|r||$fz  }|rY|| j                  j                  rjV                  fnjX                  fz  }| j                  j                  r||jZ                  fz  }|r3|| j                  j                  rj6                  fnj8                  fz  }~|s|d   }(t-        |(t\              s*t-        |(t0              r,t-        |(j^                  t\              r|(ja                  |	       nYg })|(D ]G  }*g }+|*D ]$  },|+jc                  |,j)                  |	d             & |)jc                  tK        |+             I tK        |)      }(|(|d<   |rg }-te        |	      D ]  }. | j4                  |'dd|.f   jg                  dd      fi |}/ | di |/dd|d}t-        |d   t\              s0t-        |d   t0              r6t-        |d   j^                  t\              rd|d<   |d   ji                  d       |-jc                  |        tk        |-| j                  jm                               }n1 | j4                  |'jg                  dd      fi |}/ | di |/dd|d}~/| j                  j                  r|j6                  d   }0|j6                  }1n|j8                  d   }0|j8                  }1|j:                  dddddf   jo                         }2 j)                  |	d      }3tq        |3|0|&||
|	      }4t        jr                  ||ju                  |j                  d   df      gd      }|4j=                  d      }4t        jv                  ty        |4      D .5cg c]  \  }.}5|5|.|	z  z    c}5}.      }6|'te        t{        |'            |4f   }7t        j|                  t        j~                  |0j                  d      |	            }0|0te        |      |4ddf   }0t        jr                  | |0j                  d      gd      } d}8|1D ]B  }*t        j|                  t        j~                  |*|	            te        |      |4ddf   }*|8|*fz  }8D |r> | j4                  |'dd|4f   jg                  dd      fi |}9 | di |9dddd}:|:d   };nd};t        D ]  }<|;xs t        ||<d      }; t-        |;t\              s*t-        |;t0              r,t-        |;j^                  t\              r|;j                  |6       nMg })|;D ];  }*g }+|*D ]  },|+jc                  |,|6df           |)jc                  tK        |+             = tK        |)      };t        j|                  t        j~                  |2|	            te        |      |4ddf   }!|!j=                  |j"                        }!| j                  j                  rd}=d}>|r|jZ                  D ]B  }*t        j|                  t        j~                  |*|	d            te        |      |4df   }*|=|*fz  }=D |jV                  D ]B  }*t        j|                  t        j~                  |*|	d            te        |      |4df   }*|>|*fz  }>D t        |;|8|>xs d|=xs d      }ngd}?|rQ|jX                  D ]B  }*t        j|                  t        j~                  |*|	d            te        |      |4df   }*|?|*fz  }?D t        |;|8|?xs d      }| jA                  ||| j                  j                        }|r|r	.|r|7|z  |d|z
  z  z   }7t        jr                  ||7dddf   gd      }||j                  |7j                                | |||       z  }|j                         dk(  }| j+                  |||j"                        r	||j                          |r|j                  d      t-        |d   t\              s0t-        |d   t0              r2t-        |d   j^                  t\              r|d   ji                  d       nXg }#|d   D ]@  }*g }@|*D ]  },@jc                  |,dddddf           |#jc                  tK        @             B tK        |#      |d<   | j                  j                  r#t        |||||||j                  d       	      S t        ||||||j                  d      !      S |S c c}5}.w )"a  
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        c              3   4   K   | ]  }t        |d         ywr  r  r  s     rx   r>  z6GenerationMixin._contrastive_search.<locals>.<genexpr>B  r  r  rv   Nr  rk   rl   r   r   r   r   r   r   rL  Fr  ra   Tr8  )rA  r@  r?  r   r  r   r   r  r  zQ does not support caching and therefore **can't** be used for contrastive search.z| does not have a standard cache format and therefore **can't** be used for contrastive search without further modifications.)rM  r  r&   r  .)ra   r   r}   r~   )ra   rl   rk   	rh   ri   rj   r{   r|   r}   r~   r   ra   r  )Mr'  r  penalty_alphar%  r?  r@  r  r  r  
low_memoryr   r   r   r   rq   r   r   r   r  rS  rX  r  r   r   r   r  r
  r   rl   rj   r   r  rg  r[  r  r  rm   rt   r  r   r   r  topkr}   rk   r~   r   r:  batch_repeat_interleaver  r  rN  cropstack_model_outputsrx  r  _ranking_fastr   ra  rj  r  r   r  splitr  r`  r_  r   batch_select_indicesr   r   r  r  r   r
  rz   rg   )Ar   r   ri  r  r   r  r  r  r  r  r   r)  r?  r@  r  r  r  
sequentialr  ri   r}   r~   r   r{   r|   r   cur_lenr  cosine_matrix_maskr  r   r\  last_hidden_stateslogit_for_next_stepr   ra   processed_logit_for_next_step
next_probstop_k_probs	top_k_idspastnew_key_valuesr+  r   rR  all_outputsr  next_model_inputsnext_hiddenfull_hidden_statesrj   context_hiddenselected_idxr  augmented_idxr  next_decoder_hidden_statesnext_model_inputselected_outputsnext_past_key_valuesrc  next_step_cross_attentionsnext_step_decoder_attentionsnext_step_attentionslayer_past_key_valuessA                                                                    rx   r  z#GenerationMixin._contrastive_search  s   X %('lZk'l$l!!'')77(::-??0EE)77)77"3"K"K&11
 4RD
/M$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G$zz*EJJyO_O_`77AQAQS_` #__YejjI;;))'<7LIa<b<n%12J%K"!-.>!?/AA%QAO",,-?U^UeUe,f  12:<(9:UDW<XY !23BBDI -1[)AtAA)\|\  "044ct ;;11)0)F)Fr)J&)0)>)>r)B& '.nnQAX&>&A&AU]]9;K;K 'B '#  $GG '+{{'E'E  H   " 'Id&H&H '"+$)+/;;+I+I' '	'OA| #/"2"23D"E"*$>>223 42 2 
 #?1#5u||7LM&q)!,2215C$>>223 4U U  -=YH[,\)../LRT.UJ%*ZZ
e%L"K ' #6"88J <>>F$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) #$56dL1t%89jIbIbdp>q007%'N!% < "$) OD!LL)?)?1)?)MNO&--eEl;< !0D26./ u 0A(J(J(J9UVXYUY?K_K_`bdeKf(wjv(w%" +$(-1*;	G "'*;"<lK"7+<#=?RS&w/@'A'V'VXde 6: 12$%67<<R@&&w/'0( .k4;;;V;V;XY
 %GD$F$Fy~~VXZ[G\$m`l$m! ' $)-&7	 " {{--%;;B?%,%B%B"%33B7%,%:%:" ^^Ar1H-335F/AA%QAON
 )[:Lm]bL "'#%7%@%@BTBZBZ[\B]_`Aa%bcik" (??51L "LLIlD[)\DAq!a%i-)\]M
 $E#i.$9<$GHK++ekk+2E2E!2E2Le&TUK%eJ&7q&HIK!&,>@U@UVW@X+Y_`!a)+&+ 7EKKu$=>uZ?PR^`a?ab*uh6*7
 #E4#E#Eao.33B:$>J$  $( $&$ $).&+	$  (88I'J$ (,$+: o'+?+n77TgimCn(o 2LA35HI"#7#L#Ll[(==mL%'N!5 < "$) CD!LLmS.@)ABC&--eEl;< ,1+@("'++ekk&%.H"I%PZJ[]iklJl"m"5"8"89I9I"J {{---/*/1,$!(!9!9 ? %EKKu!,L MeT^N_amorNr s2uh>2? ")!;!; A %EKKu!,L MeT^N_amorNr s4@4A *$8*D'C'Kt%?%G4	 (*$$!(!3!3 9 %EKKu!,L MeT^N_amorNr s,8,9 1$8"<3;t  CC#';;#A#A D L
 1 )),@@<STWkSkCll 		9k!T'.B"CLI#[__./ $8;LYX^;_:_#_ !5!9!9!;q!@w ,,-?U^UeUe,fz LLN"  12>l+<=|L|,=>@ST"<0A#B#W#WYef !2388<&(O!-.?!@ M02-$) LD188c3B3k9JKL'..u5J/KL	M
 7<O6LL!23{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  i *]s   9r"
c                 l
   |j                   }|j                  }	|j                  }
|j                  }|j                  }|j
                  }t        d |D              }|j                  }|r|rdnd}|r|rdnd}|r|	rdnd}|r|	rdnd}|r|
rdnd}|rF| j                  j                  r0|	r|d   j                  d      nd}|
r|d   j                  d      nd}|j                  dd \  }}d}t        j                  |t        j                  |j                  	      }| j!                  ||j                  |      }| j"                  }| j%                  ||      }|rd
t&        j(                  d<   | j                  j*                  dk(  rvt-        |j                  d      dd      rZ|j.                  t1        d      |_        n<|j.                  j2                  r&t4        j7                  d       d|j.                  _        | j9                  |j.                        }|j:                   | j<                  ||fi |}d}nd}| j?                  |||j                        rn | j@                  |fi |}|jC                  |	rd|	ini        |jC                  |
rd|
ini        |r | di |ddi}d}n |di |ddi}| jE                  ||| j                  j                        }|r|r|jF                  dddddf   jI                  dt        jJ                  |j                        }  |||       }!|r|r||!fz  }|r|| fz  }|	rY|| j                  j                  r|jL                  fn|jN                  fz  }| j                  j                  r||jP                  fz  }|
r3|| j                  j                  r|jR                  fn|jT                  fz  }|rHtV        jX                  j[                  |!d      }"t        j\                  |"d      j_                  d      }#nt        j`                  |!d      }#|r|#|z  |d|z
  z  z   }#t        jb                  ||#dddf   gd      }||je                  |#jg                                | |||       z  }|ji                         dk(  }|dz  }~| j?                  |||j                        rn||jk                          |rY| j                  j                  r#tm        |||||||j                  d      	      S to        ||||||j                  d            S |S )a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        c              3   4   K   | ]  }t        |d         ywr  r  r  s     rx   r>  z*GenerationMixin._sample.<locals>.<genexpr>  r  r  rv   Nr  rk   rl   r   Fr   0TOKENIZERS_PARALLELISMflash_attention_2ra   r   )	fullgraphzWhen using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as FA2 introduces graph breaks. We overrode the option with `fullgraph=False`.Tr  r?  r@  rA  r  r   r  rL  r&   r  r   r  r  )8r%  r?  r@  r  r  r  r'  rv  r   r   r   r   rq   r   r   r   r  __call__r  r   environr   r   rz  r7   rH  r  r  get_compiled_callprefill_chunk_size_prefill_chunkingr  r
  r   rg  rj   r   r  r}   rk   r~   r   rl   r   r   r  r  r  r	  r   r  r  r   r
  rz   rg   )$r   r   ri  r  r   r  r  r  r)  r?  r@  r  r  r  r  rv  ri   r  r}   r~   r   r{   r|   r   r*  r  r  model_forwardcompile_forward
is_prefillr   r\  r  r  r  r  s$                                       rx   r  zGenerationMixin._sample  s   V )::-??0EE)77)77"3"K"K$''lZk'l$l!%//	 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G"$zz*EJJyO_O_`77AQAQS_`;;LJ[\36BJJ/0{{//3FF7  !235EuL %33;7Du7U%4&55??''f BG%44> 223D3S3STM//;1411)=N_R^_LJJ,,-?U^UeUe,f=4==iX<XL L]!46G HcefRf!79M Nlno@@4@"
'I,IDI  CC#';;#A#A D L
 1 !(q"ax 8 ; ;U]]clcscs ; t !1<M N ' 022F #4"66J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) --.?R-H#//1EMMaP#ll+<"E )),@@<STWkSkCll 		9k!T'.B"CLI#[__./#7;LYX^;_:_#_ !5!9!9!;q!@qLG S ,,-?U^UeUe,fV LLN"{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  rw   rj  c                 x    t        | j                        }t        j                  | |d   |d   z  g|dd z         S )z=[batch_size, num_beams, ...] -> [batch_size * num_beams, ...]r   r&   r   Nr  r   rq   r  )rj  r   s     rx   _flatten_beam_dimz!GenerationMixin._flatten_beam_dim}  s>     V\\"}}VeAhq&9%:U12Y%FGGrw   r  c                 h    t        | j                        }t        j                  | ||g|dd z         S )z=[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]r&   NrR  )rj  r   r  r   s       rx   _unflatten_beam_dimz#GenerationMixin._unflatten_beam_dim  s3     V\\"}}Vj)%<uQRy%HIIrw   c                    t        |j                        t        | j                        k  r=|j                  d      }t        |j                        t        | j                        k  r=t        j                  | |d      }|S )a  
        Gathers the beam slices indexed by beam_indices into new beam array.

        Args:
            tensor (`torch.Tensor`): A tensor containing data to be gathered. The tensor is a 2D or a 3D tensor
                with the two first dimensions depicting the batch and the beam dimensions.
            beam_indices (`torch.Tensor` of shape `(batch_size, num_beams_to_select)`): The indices of the beams to
                select .

        Returns:
            A tensor with the selected beams
        r   r&   )inputr  rM  )r   r   r`  rq   take_along_dim)rj  r   gathered_tensors      rx   _gather_beamszGenerationMixin._gather_beams  si     ,$$%FLL(99'11"5L ,$$%FLL(99..V\WXYrw   #is_early_stop_heuristic_unsatisfiedrunning_beam_scoresbeam_scoresis_sent_finishedr*  rq  decoder_prompt_lenr  r  c	                     |dk(  r|dkD  r||z
  }	n||z
  }	|ddddf   |	|z  z  }
t        j                  |t        j                  |dd      d   d      }| t        j                  |
|kD  d	d      z  S )
uH  
        Determine whether early stopping is possible by checking if the best possible score of running beams
        could still improve upon the finished ones.

        Mechanism:
        - Without a length penalty, beam scores typically decrease as more tokens are generated.
        So, if the *best possible* score from any running beam is already worse than the *worst* finished beam,
        we can safely stop early.
        - With a length penalty, scores may increase with longer sequences. In this case, we use heuristics
        to estimate the best possible score — though this estimate may not always be correct — and stop
        if no further improvement seems likely.

        We apply different heuristics depending on the value of `early_stopping`:
        1. `early_stopping == False`:
        -> Use a heuristic that assumes the best score comes from the current length minus the decoder prompt length.
        -> See detailed discussion: https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565

        2. `early_stopping == "never"`:
        -> Estimate the best score using either `max_length` or `cur_len`, depending on the sign of `length_penalty`.
        -> A positive length penalty favors longer sequences, so we use `max_length` in that case.

        NOTE: the canonical beam search implementation can be replicated with `early_stopping="never"` and
        `length_penalty=0.0`, which are NOT the default flags. The default behavior was empirically found to produce
        better sequences (prior to 2022), and changing it is BC breaking.
        neverr  Nr&   TrM  keepdimr       er   )rq   r  r  r'  )r[  r\  r]  r^  r*  rq  r_  r  r  best_hypothetical_lengthbest_possible_running_scoreworst_finished_scores               rx   _check_early_stop_heuristicz+GenerationMixin._check_early_stop_heuristic  s    J W$#)='14F'F$'.1C'C$&9!RaR%&@D\^lDl&m#${{+;UYY{XYcg=hij=kmst2UYY'*>>BPT6
 
 	
rw   !next_token_hits_stopping_criteriac                     t        j                  |       }t        j                  |      |du z   }t        j                  |       }||z  |z  S )zv
        Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
        T)rq   r'  rQ  )r[  r^  ri  r  improvement_possibleexists_open_beamvalid_continuationss          rx   %_beam_search_has_unfinished_sequencesz5GenerationMixin._beam_search_has_unfinished_sequences  sZ      %yy)LM #YY'78Nd<RST  %yy)JKK#&669LLLrw   accumulated_log_probsrunning_sequencesrunning_beam_indicesrv  beams_to_keepry  c                    |rOt        j                  t        j                  j	                  |d      |      }t        j
                  |d|      }nt        j                  ||      \  }}||	z  }| j                  ||      }| j                  ||      }||	z  }||dddd|f<   t        j                  |
|j                        j                  dd      |z  }||z   }||dddd||z
  f<   |||fS )	a'  
        Get top-K continuations given the accumulated log probs on the next token.

        A few notes to understand what's going on:
        1. Each item in batch has `num_beams` * `vocab_size` candidate continuations. For each item, get the
        top K [K = (number of EOS tokens + 1) * `num_beams`] candidates with the highest accumulated
        log-probabilities, or sample them without replacement using the accumulated scores
        2. We gather the top K (as opposed to `num_beams`, or any number lower than K) here so that we have at
        least `num_beams` sequences remaining to continue the live beam search.
        3. Note that other stopping criteria might result in impossible to continue beams, i.e. all continuations
        selected in this step hit the stopping criteria.
        r   rL  r  r&   )rW  rM  indexr  Nr  )rq   r  r   r   r  r  r"  rZ  rb  r   rN  )r   ro  rp  rq  r*  r_  rv  rr  r  ry  r   topk_indicestopk_log_probstopk_current_beam_indicestopk_running_beam_indicestopk_running_sequencestopk_idsbatch_offsetbatch_modified_indicess                      rx   _get_top_k_continuationsz(GenerationMixin._get_top_k_continuations  s   <  ,,%%&;%DR_L #\\0E1T`aN+0::6K}+](NL %1J$>!$($6$67KMf$g!!%!3!34EG`!a*, 19q!W}- ||JxGLLRQRSV__!:\!IH^!!Q2D(D"DE57PPPrw   rw  rz  ry  c                     ||j                  t        j                        dz  z   }t        j                  ||      d   }| j	                  ||      }| j	                  ||      }	| j	                  ||      }
||	|
fS )z
        Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
        best non-finished beams to continue beam search in the next iteration.
        rd  ru  r&   )r   rq   r  r"  rZ  )r   rw  rz  ry  ri  r  topk_running_log_probsnext_topk_indicesrp  r\  rq  s              rx   %_get_running_beams_for_next_iterationz5GenerationMixin._get_running_beams_for_next_iteration  s     "02S2V2VW\WdWd2ehn2n!n!JJ'=KAN ../EGXY"001GIZ[#112KM^_ "57KKKrw   top_num_beam_maskc                    |	|
dddf   z  }||dz   |z
  |z  z  }t        j                  |dd      |du z  }||j                  t         j                        dz  z  }|| j                  t         j                        dz  z  }|| dz  z  }t        j                  ||fd      }t        j                  ||fd      }t        j                  ||fd      }t        j                  ||fd      }t        j
                  ||      d   }| j                  ||      }| j                  ||      }| j                  ||      }| j                  ||      }||||fS )	z
        Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
        the current finished sequences.
        Nr&   r   T)axiskeepdimsrd  rL  ru  )rq   rQ  r   r  r   r"  rZ  )r   rh   rz  r]  rw  r   ry  r[  r^  ri  r  r  r*  r_  r  r  did_top_num_beams_just_finishedbeams_in_batch_are_fullmerged_sequencesmerged_scoresmerged_beam_indicesmerged_is_sent_finishedtopk_merged_indicess                          rx   _update_finished_beamsz&GenerationMixin._update_finished_beams/  s   2 +LN_`dfg`gNh*h' (GaK:L,LQ_+_`"')),<2PT"UYgkoYo"p144U]]CfLL??CCEMMRU[[[ 	;;vEE
 !99i1G%HaP		;"?QG#ii7P(QWXY"'))-=?^,_ef"g#jj)DQG&&'79LM	((8KL))*=?RS--.EGZ[+|5EEErw   c                    |j                   }|j                  }|j                  }	|j                  }
|j                  }|j
                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  dd \  }}||z  }| j                  j                  dk(  r| j                   j"                  }nX| j                  j                  dk(  r| j%                         j&                  }n$| j                   j)                         j*                  }|}d}||j                  d   nd}t-        dd|z         |z  }t/        j0                  t/        j2                  |t.        j4                        t/        j6                  ||z
  t.        j4                        fd	      j9                  |j:                        }| j=                  ||j:                  |      }|j>                  }|rtA        d
      |r|rdnd}|r|rdnd}|r|rdnd} |r|	rdnd}!|r|	rdnd}"|r|
rdnd}#|rF| j                   jB                  r0|	r|d   jE                  d      nd}$|
r|d   jE                  d      nd}%|	|xs |d   nd}&t/        jF                  |||f|&t.        jH                  |j:                        }'| jK                  |||      |'ddddd|f<   |'jM                         jO                         }(t/        j6                  ||ft.        jP                  |j:                        })d|)ddddf<   t/        jF                  ||fdt.        jP                  |j:                        }*t/        j6                  ||ft.        j4                  |j:                        }+t/        j2                  |dft.        j4                  |j:                        },t/        j6                  ||ft.        j4                  |j:                        }-t/        jF                  ||||z
  fdt.        jR                  |j:                        }.|.jM                         jO                         } | jU                  |||j:                        rO| jW                  |'ddddd|f         }/ | jX                  |/fi |}0|0j[                  |	rd|	ini        |0j[                  |
rd|
ini         | di |0ddi}1| j]                  |1|| j                   jB                        }|r|r|1j^                  dddddf   j9                  dt.        j`                  |j:                        }2tb        jd                  jg                  |2d	      }3 ||/|3      }3|r|r||2jO                         fz  }|r|r||3jO                         fz  }|	rY|!| j                   jB                  r|1jh                  fn|1jj                  fz  }!| j                   jB                  r|"|1jl                  fz  }"|
r3|#| j                   jB                  r|1jn                  fn|1jp                  fz  }#~1| jK                  |3||      }3|3|)dddddf   z   }3t/        jr                  |3|||z  f      }3| ju                  |3|'|.|||||||
      \  }4}5}6 || jW                  |5ddddd|dz   f         |      }-| jK                  |-||      }-| jw                  |4|5|6|-|      \  }'})}.| jy                  |(|5|*|4| |6|,|+|-||||||      \  }(}*} }+|jE                  dd      R| jW                  |.d||z
  f         }7t{        | d      r| j}                  |d   |7      |d<   n|d   j                  |7       |dz   }| j                  |,|)|*|+||||| 	      },| j                  |,|+|-|       }| jU                  |||j:                        rO| jW                  |(ddd|ddf         }(| jW                  |*ddd|f         }*| jW                  | ddd|ddf         } | dz   j5                         j                  d	      j-                         }8||8z   }9|(ddd|9f   }(| ddd|8f   } |ra|sd}*| j                   jB                  r%t        |(|*||| $%|!|"|#|jE                  d      !      S t        |(|*||| |!|#|jE                  d      "      S |(S )#a	  
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        If it's the first time you're diving into Beam Search, we recommend you read the following blog post:
        https://huggingface.co/blog/how-to-generate (especially the beam search section).

        You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function
        (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores)

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`:
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        Nr   MoshiDepthDecoderImageGPTForCausalImageModelingFr   r&   r   rL  z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.rv   r  rk   rl   r   )
fill_valuer   r   r   rd  r  r?  r@  rA  Tr  r  )
ro  rp  rq  r*  r_  rv  rr  r  ry  r   )rw  rz  ry  ri  r  )rh   rz  r]  rw  r   ry  r[  r^  ri  r  r  r*  r_  r  r  ra   ._reorder_cache)	r[  r\  r]  r^  r*  rq  r_  r  r  rh   r   ri   rj   r   r{   r|   r}   r~   r   ra   rh   r   ri   rj   r   rk   rl   ra   )Er%  r&  r?  r@  r  r  r  rv  r  r  rq  r  r\  r   r  rm   r   audio_vocab_sizer  out_featuresrx  ry  r   rq   r   r   ry  r   r   r   r  r!  r  r   r   fullr  rU  r  r   r  r   r  rS  r
  r   rg  rj   r  r   r   r  r}   rk   r~   r   rl   r  r~  r  r  r   r  reorder_cacherh  rn  r  r   r   ):r   r   ri  r  r   r  r  r)  r*  r?  r@  r  r  r  rv  r  r  rq  r  r\  batch_size_unflattenedr*  r   ry  r_  r  n_eos_tokensrr  r  r)  
all_scoresr  r   r}   r~   r   r{   r|   output_fill_valuerp  rh   r\  r]  r^  r[  ri  rq  flat_running_sequencesr   model_outputsrj   	log_probsrw  rz  ry  beam_idxmax_generated_lengthoutput_lengths:                                                             rx   r  zGenerationMixin._beam_searchf  s	   \ )::(::-??0EE)77)77"3"K"K%//	*99*99&11
%//	0EE*3//"1*='+y8
>>""&9955J^^$$(HH335BBJ446AAJ$" 1=0H|))!,aAq</09<!IIZZ5::6]U^E^glgqgq8rs
 "Y
 	
 77AQAQS_` '11
t  4RD
3RD
5-rd$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " @L?WL;LO]_!JJJ/(++##	
 -1,D,DYPZ\e,f!Q.)%,,.446	
 $kk:y*A]f]m]mn%)AqrE"jj*i!8TQVQ\Q\eneueuv !;;
I'>ejjYbYiYij /4jj*aPUPZPZclcscs.t+ -2KK#5::i>N>N-
)
  %zzJ$89bPUP[P[dmdtdt 
 ,224::< ,,-?U^UeUe,f%)%;%;<MaQRT\U\T\n<]%^"=4==>TeXdeL L]!46G HcefRf!79M Nlno B<BTBM  CC#';;#A#A D L
 1 #))!R(366D^g^n^n6oF 11&b1AI()?KI ' 6<<>"33J*}9??#4"66J$&;;99 '99;+668&
 {{55(]-K-K,MM(');;99 '<<>+99;) 00J	RI!$71d
$CCIi*i*>T1UVI QUPmPm&/"3%9#5#+#%% Qn QMN24M 1B&&'=aMgPQkM>Q'RS1- 150H0H1:}1-
 LPKuKu-'=*C2S# Lv LH24H FJE`E`#'='-)*C4W!12S"3##5-- Fa FBI{L2B.  148D112FsGVhLhGh2ij4!126:6I6I,WhJiks6tL!23 !23AA(KkG262R2R4W$7'!1%#5-- 3S 
3/ &*%O%O3 1	& "O ,,-?U^UeUe,f` **9Q8M9M8Mq5P+QR	,,[<Q=Q<Q9Q-RS--l1>S?S>SUV;V.WX ".!1 7 7 9>>1>EIIK*-AAa-/0	#A'<(<'<$<=" "{{--7'%0%%!-'9*?'9%5*?$0$4$45F$G  5'%0%%!-1"7$0$4$45F$G	 	 rw   r  c                 X  :;< |j                   }|j                  }	|j                  }
|j                  }|j                  }|j
                  }|j                  }|j                  }|j                  }||z  }t        |j                        |z  }|j                  }|j                  \  }}| j                  ||j                  |      }|r8|r6t        |      D cg c]   }t        d t        ||z        D              " c}<nd<||z  |k7  rt!        d||z   d| d      |r|rdnd}|r|rdnd}|r|
rdnd}|r|
rdnd}|r|rdnd}|rF| j"                  j$                  r0|
r|d   j'                  d      nd}|r|d   j'                  d	      nd}t)        j*                  ||fd
t(        j,                  |      }d|dddd|f<   |j/                  ||z  f      }d}|j                  d   } | j1                  |||j                        rt)        j2                  ||z  |j4                  |      }!t)        j2                  ||z  t(        j6                  |      }" | j8                  |fi |}#|#j;                  |
rd|
ini        |#j;                  |rd|ini         | di |#ddi}$| j=                  |$|| j"                  j$                        }|r|r|dz   }|r)t)        j>                  |$j@                  dddddf         }%|r1|$j@                  dddddf   jC                  d|j                        }&t        |      D ]T  ::|z  }'tE        |'|z   |      }(|(|'z
  })g }*t        |      D ]1  }+|*jG                  t        |'|(      D ,cg c]
  },|+|z  |,z    c},       3 ||*   }-|$j@                  |*dddf   jC                  t(        jH                  |j                        }.tJ        jL                  jO                  |.d      }/|/j                  d   }0 ||-|/|!:      }1|1||*   jQ                  d      z   }/|/jS                  |1      }/|r|1%|*<   |/j/                  ||)|0z        }/|	|	j                  d   nd}2t)        jT                  |/tW        dd|2z         |)z  ddd      \  }/}3t)        jX                  |3|0d      }4|3|0z  }3<t[        <d      nd}5|j]                  |-|/|3|4||	|5:| 	      }6|6d   ||*<   |6d   }7|6d    ;|r0|r.t        :;<fd!t        t        <d               D              <:<   |-;   ||*<   t)        j^                  |-;ddf   |7jQ                  d      gd      }-|-dddf   |!|*<   |t)        jX                  ;|)d      z  |'z   ;|)z  z   |"|*<   W |r|r|%fz  }|r|&fz  }|
rY|| j"                  j$                  r|$j`                  fn|$jb                  fz  }| j"                  j$                  r||$jd                  fz  }|r3|| j"                  j$                  r|$jf                  fn|$jh                  fz  }t)        j^                  ||!jQ                  d      gd      }~$|j'                  d"d      9tk        | d#      r| jm                  |d"   |"      |d"<   n|d"   jo                  |"       |dz   }|jp                  sts         |||            rd}| j1                  |||j                        r<t[        <d      nd}8|ju                  ||34||	|jv                  |8| $	      }9|rv|sd|9d%<   | j"                  j$                  r.ty        |9d&   |9d%   |||9d'   ||||j'                  d"      (      S t{        |9d&   |9d%   |||9d'   |||j'                  d"      )      S |9d&   S c c}w c c},w )*a	  
        Generates sequences of token ids for models with a language modeling head using **diverse beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        c              3       K   | ]  }d   ywrv   Nrv   r;  r   s     rx   r>  z5GenerationMixin._group_beam_search.<locals>.<genexpr>  s     !P"!P   N)Batch dimension of `input_ids` should be 	, but is r  rv   r  rk   rl   rd  r   r   Fr&   r  r?  r@  rA  Tr  r   )r  r   rL  )current_tokensbeam_group_idxr   rM  largestsortedfloor)rounding_mode)r)  r*  r   group_indexr_  next_beam_scoresnext_beam_tokensnext_beam_indicesc              3   B   K   | ]  }   |      |   fz     y wr   rv   )r;  r  r  r  r   s     rx   r>  z5GenerationMixin._group_beam_search.<locals>.<genexpr>u  s/      9WX^4Xa[AXa[NR9s   ra   r  r)  r*  rq  r   r_  sequence_scoresrh   r   r  r  )>r%  r&  r?  r@  r  r  r  r  r  r   
_beam_hypsr   r   r  r  rt   r  r   r   r   rq   r  r  rN  r  r   r   r   r
  r   rg  
zeros_likerj   r   r  extendr  r   r   r  r`  	expand_asr"  r   divr  processr   r}   rk   r~   r   rl   r   r  r  is_donerQ  finalizerq  r   r   )=r   r   r  ri  r  r   r  r  r)  r*  r?  r@  r  r  r  r  r  num_sub_beamsr   r   batch_beam_sizer*  r   ri   r  r}   r~   r   r{   r|   r]  r  r_  r  reordering_indicesr   r\  processed_scoreraw_logit_scoregroup_start_idxgroup_end_idx
group_sizebatch_group_indicesr  r*  group_input_idsr  r  ry  next_token_scores_processedr  r  next_indicesprocess_beam_indicesbeam_outputsbeam_next_tokensfinal_beam_indicessequence_outputsr  r  r   s=                                                             @@@rx   r  z"GenerationMixin._group_beam_search  s	   V )::(::-??0EE)77)77"3"K"K))	%55!_4//0OC
!!#,?? 77AQAQS_`"}Z_`oZpqUVE!PeMJ4N.O!PPqLLz!_4;I
<R;SS\]l\mmno 
 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " jj*i!8$ekkZ`a*+A&'!&&
Y(>'@A"&__Q/,,-?U^UeUe,f"[[i)?y_efN "'Z)-C5::^d!e >4==iX<XL L]!46G HcefRf!79M Nlno<\<t<G  CC#';;#A#A D L
 1!A+"'"2"27>>!R(3K"L #*..B":"="=4PYP`P`"="a"'"8 O"0="@ #Om$CY O*_<
 ')#!&z!2 I'..@EoWd@efY.4f #,,?"@
 %,NN3FA3M$N$Q$Q--	0@0@ %R %! %'MM$=$=%2 %> %! /44R8
.>#%6~ft/+ %@+NaBbBlBlmoBp$p!$5$?$?@[$\! ;VO$78 %6$:$::zT^G^$_! 9E8P|11!4VW16%s1a,.>'?*'LRS]ajn2.!;  %yyjPWX)J6 AM@Xs<'<^b$*22#% !-!-!5 .'9  3 
  4@@R3S/0#/0B#C '(;<*}38 9\abefrstfubv\w9 4L0 2A1J	-."'))_Xq[-IK[KeKefhKi,jpr"s6Eae6L23
 		(Jg VV%&*,. ##67WOd ' 00F ?"44J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) 		9n.F.Fr.J"KQSTI   148D4!126:6I6I$%679K7L!23 !!23AABTU kG""c*;Iv*N&O%)"w ,,-?U^UeUe,fz 7C6NSr2TX&//%%(33+1 0 

 # 6: !23{{--7.{;%56G%H!%!1.!A'9*?'9%5*?$0$4$45F$G  5.{;%56G%H!%!1.!A1"7$0$4$45F$G	 	 $K00Q rV gs   %^"^'r  c                 |  *+ |j                   }|j                  }	|j                  }
|j                  }|j                  }|j
                  }|j                  }t        |j                        }|j                  }|j                  dd \  }}| j                  ||j                  |      }||z  |k7  rt        d||z   d| d      |r|rdnd}|r|rdnd}|r|rt        d t        |      D              nd+|r|
rdnd}|r|
rdnd}|r|rdnd}|rF| j                   j"                  r0|
r|d   j%                  d	      nd}|r|d   j%                  d
      nd}t'        j(                  ||ft&        j*                  |j                        }d|ddddf<   |j-                  ||z  f      }d}|j                  d   }| j/                  |||j                        rF | j0                  |fi |}|j3                  |
rd|
ini        |j3                  |rd|ini         | di |ddi}| j5                  ||| j                   j"                        }|r|r|dz   }|j6                  dddddf   j9                  dt&        j:                  |j                        }t<        j>                  jA                  |d      }  |||       }!|!|dddf   jC                  |!      z   } | jE                         }"|r|r|| fz  }|r||fz  }|
rY|| j                   j"                  r|jF                  fn|jH                  fz  }| j                   j"                  r||jJ                  fz  }|r3|| j                   j"                  r|jL                  fn|jN                  fz  }| j                  d   }#| j-                  |||#z        } |	|	j                  d   nd}$t'        jP                  | tS        dd|$z         |z  ddd      \  } }%|%|#z  jU                         }&|%|#z  }%|jW                  || |%|&|"||	+|	      }'|'d   }|'d   }(|'d   *t'        jX                  |*ddf   |(j[                  d      gd      }~|j%                  dd      9t]        | d      r| j_                  |d   *      |d<   n|d   ja                  *       |r)|r't        *+fd t        t        +            D              +|dz   }|jb                  ste         |||            rd}| j/                  |||j                        rF|jg                  ||%&||	|jh                  +|!	      })|rv|sd|)d"<   | j                   j"                  r.tk        |)d#   |)d"   |||)d$   ||||j%                  d      %      S tm        |)d#   |)d"   |||)d$   |||j%                  d      &      S |)d#   S )'a|	  
        Generates sequences of token ids for models with a language modeling head using **constrained beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation, while satisfying a list of positive constraints. For more information, the
                documentation of [`ConstrainedBeamSearchScorer`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        Nr   r  r  r  rv   c              3       K   | ]  }d   ywr  rv   r  s     rx   r>  z;GenerationMixin._constrained_beam_search.<locals>.<genexpr>   s     5"5r  r  rk   rl   r   rd  r&   Fr  r?  r@  rA  Tr  r   r  rL  r   r  )r)  r*  r   r_  r  r  r  ra   r  c              3   <   K   | ]  }|      |   fz     y wr   rv   )r;  r  r  r   s     rx   r>  z;GenerationMixin._constrained_beam_search.<locals>.<genexpr>  s&     $rTU\(1+%>(1+%O$rs   r  r  rh   r   r  r  )7r%  r&  r?  r@  r  r  r  r   r  r  r   r  r   r  rt   r  r   r   r   rq   r   r  rN  r  r
  r   rg  rj   r   r  r   r   r  r  r   r}   rk   r~   r   rl   r"  r   r   r  r   r`  r   r  r  r  rQ  r  rq  r   r   ),r   r   r  ri  r  r   r  r  r)  r*  r?  r@  r  r  r  r   r  r  r*  ri   r  r}   r~   r   r{   r|   r]  r  r_  r   r\  r  r  r  scores_for_all_vocabry  r  r  r  r  r  r  r  r   s,                                             @@rx   r  z(GenerationMixin._constrained_beam_search  s   X )::(::-??0EE)77)77"3"K"K0;;<
+55	#,??2A#6 77AQAQS_`z!_4;I
<R;SS\]l\mmno 
 0M3RD
:QVcE5eO455jn 	 %<@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " kk:y"9U^UeUef!AqrE!&&
Y(>'@A"&__Q/,,-?U^UeUe,f=4==iX<XL L]!46G HcefRf!79M Nlno<\<t<G  CC#';;#A#A D L
 1!A+
 !(q"ax 8 ; ;U]]clcscs ; t " 9 9!r !: ! +;9FW*X' ;k!T'>R>\>\+? ! $5#:#:#<  ' 022F #4"66J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) +004J 1 6 6z9zCY Z 5A4L<--a0RSL-2ZZ!3q!l*:#;i#GQX\ei.*{ (*4::<L%
2K 3::!$)))#5 ; 
L ''9:K+,>?#$78H		9Xq[#9;K;U;UVX;Y"Z`bcI   148D4!126:6I6I,WhJiks6tL!23 !23AA(K&=$$rY^_bco_pYq$rr kG&..#6G	SY6Z2[%)"U ,,-?U^UeUe,fX 3;;%%(33%1 < 

 # 6: !23{{--7.{;%56G%H!%!1.!A'9*?'9%5*?$0$4$45F$G  5.{;%56G%H!%!1.!A1"7$0$4$45F$G	 	 $K00rw   r}  c                   *+ |j                   }	|j                  }
|j                  }|j                  }|j                  }|j
                  }|r|rdnd}|r|rdnd}|r|
rdnd}|r|
rdnd}|r|rdnd}|rF| j                  j                  r0|
r|d   j                  d      nd}|r|d   j                  d      nd}|j                  dd \  }}t        j                  |t        j                  |j                        }| j                  ||j                  |      }d}d	}| j                  |||j                  
      r|j                  d   }|j!                  |      \  }}|j#                  | j                        }||j#                  | j                        }|j                  d   |j                  d   z
  } ||d      }t%        j$                  |      }t'        ||j                  d   | j                  j                        }t)        ||j                  d         }d|v rQt        j*                  |d   t        j,                  |||z   |j                  t        j                        fd      |d<    | j.                  |fi |} d| v r|dz   | d<   | j1                  |
rd|
ini        | j1                  |rd|ini         | di | }!|!j2                  dd| dz
  df   j#                  t        j4                  |j                        **j7                         +t9        |      dkD  r<t;        |dz         D ]+  }" ||ddd||"z   f   *dd|"ddf         *dd|"ddf<   - |	r|t=        |||*|      \  }#}$n|	rJ*j?                  d      }%t        j@                  |%dddddf   d      jC                  d      dddf   }&n*jE                  d      }&|dd|df   }'|'|&ddddf   k(   jG                  d      dk  jI                         }$|r
|$|k(  r|$dz  }$|&ddd|$dz   f   }#t        j*                  ||#fd      }||jK                  |#jM                                |j                  d   }(|!jN                  jQ                  |(dz
         |jS                  |*|$       | jU                  |!|| j                  j                  |$dz         }|r|r|r|$dz   })|r |tW        *fdt;        |)      D              z  }|r |tW        +fdt;        |)      D              z  }|r|(n|)})|
rr| j                  j                  r3tY        ||!jZ                  ||)      }tY        ||!j\                  ||)d	      }n)|!j^                  d   tY        ||!j^                  ||)d	      }|rG| j                  j                  rtY        ||!j`                  ||)      }ntY        ||!jb                  ||)      }| |||       z  }|je                         dk(  }d}| j                  |||j                  
      r||jg                          ti        |d      rH|jj                  jl                  jn                  dk(  r%|jp                  |jj                  jl                  _8        |rY| j                  j                  r#ts        |||||||j                  d      	      S tu        ||||||j                  d            S |S )a
  
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
        models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            candidate_generator (`CandidateGenerator`):
                A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
                more information, the documentation of [`CandidateGenerator`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        rv   Nr  rk   rl   r   r   FTr  r&   r   ri  r   rL  rd  r?  r@  r   r  )r   r]  c              3   6   K   | ]  }d d |d d f     y wr   rv   )r;  r  
new_logitss     rx   r>  z5GenerationMixin._assisted_decoding.<locals>.<genexpr>  s     #[AJq!Qw$7#[   c              3   6   K   | ]  }d d |d d f     y wr   rv   )r;  r  r  s     rx   r>  z5GenerationMixin._assisted_decoding.<locals>.<genexpr>  s     'fq(9!Q'(B'fr  )is_decoder_attentionrh  	heuristicra   r  r  );rv  r?  r@  r  r  r  r   r   r   r   rq   r   r   r   r  r  get_candidatesr   r  r3   r4   r   rb  r
  r   rj   r  r   r   r  _speculative_samplingr  r  r  r	  r   r  r  r  ra   r$  update_candidate_strategyrg  rt   _split_model_outputsr~   r}   rk   r   rl   r   r
  r   rh  r   num_assistant_tokens_schedulenum_assistant_tokensrz   rg   ),r   r   r}  ri  r  r   r  r  r  rv  r?  r@  r  r  r  ri   r  r}   r~   r   r{   r|   r   r*  r  r  is_first_iterationcandidate_input_idscandidate_logitscandidate_lengthis_done_candidatecandidate_kwargsr   r\  r  valid_tokens	n_matchesr  selected_tokenscandidate_new_tokensnew_cur_lennewly_added_lengthr  r  s,                                             @@rx   r  z"GenerationMixin._assisted_decoding  s   b &//	-??0EE)77)77"3"K"K 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G$zz*EJJyO_O_`77AQAQS_`"!,,-?U^UeUe,fooa(G 5H4V4VW`4a1!1"5"8"8"E+#3#6#6t{{#C 288;iooa>PP 12Et L  $yy66 "5";";A">@^@^   77GI\IbIbcdIef#335:YY()9:Wg8H.HQZQaQainisist 6 !12 >4==>QfUefL</1AA1E-. L]!46G HcefRf!79M Nlno*\*G !,<+<q+@+B(BCFFmmI,<,< G J !+ 0 0 2#$q(/!34 wA*:;NqR_T[^_T_R_O_;`blmnpqstmtbu*vJq!Qw'w -9*?'$$%+'i &..2.6E&+&7&7aAgTU&V&^&^_`&abfhibi&jO&0&7&7B&7&?O':1gh;'G$ 43B38O OPXX]_X`cddiik	 %6F)FNI.q/IM//AB 		9l";DI#\--/0#//!,K ##((q9  99)ZQZ[  CC#';;#A#A(1}	 D L 1 '%.]" e#[GYAZ#[[[F %'fERdLe'f"ffJ4F[L^"${{55+?,g.F.FQc,( .B.#66#.15.* !++A.:-A.#..#.15.* ({{550D173P3PRY[m1- 1E173H3H'Se1- $8;LYX^;_:_#_ !5!9!9!;q!@!&q ,,-?U^UeUe,ft LLN '):;#33EEccgrr $88  //AAV #{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  rw   c                    dt         j                  j                  _        |j                  }t        j
                  |d d d df   |d      }d|vrt        d      | j                  }| j                  ||      }|r| j                  |j                        }|j                  dd       }d}	|D ]  }
|	|
j                  d   z   }||d d d |f   |d<   t        j                  |	|t         j                  |
j                        |d	<   |d	   j!                  d      |d
<    | j"                  |
fi |} |di |ddi}|j$                  |d<   |}	 ||d<   |d	   dd  dz   |d	<   |j                  d
d       }|S )N@   r   rL  ra   z+Cannot use prefill chunking without a cacher   r   r   r   r   rA  Tr&   rv   )rq   _dynamor   cache_size_limitrL  r'  r  r   r  rK  rz  r   r   rb  r   r   r`  r
  ra   )r   r   r   r  
chunk_sizeinput_chunksrN  rO  r   r  input_chunkcurrent_lengthr   r\  r   s                  rx   rM  z!GenerationMixin._prefill_chunking  s    13-&99
 {{9QV#4jbIL0JKK;;LJ[\ 223D3S3STM%))*:DA' 	)K(;+<+<R+@@N)1??N?@R1S-.-2\\^5::kFXFX.L)* ,88H+I+S+STU+VL(=4==kZ\ZL#ElEEG.5.E.EL*+(K	)  *8%&)56F)G)Lq)P%&^T2rw   )NN)NNNN)NNNr   )r&   FN)Fr&   )NNNNNNNN)NF)NNNNNNNNNNNN)Trm   rn   ro   rp   r   r   r   r   PathLikery  r   r   rq   rr   ru   rt   r   r   r   r
  r  r  r  r  r8   r   r1  rG  r   r   rT  staticmethodr[  r    rg  rE   r/   r~  r  r  rZ   r  r  r  r  r  r  r  r  r  r,  rC  classmethodrM  rb  re  rt  r  no_gradGenerateOutputr   r  r  GenerateNonBeamOutputr  r  r  rS  rU  rZ  r  rh  rn  r~  r  r  GenerateBeamOutputr  r)   r  r+   r  r  rM  rv   rw   rx   r   r   b  sO   B LP,0A('/c2;;6F0G'HA( $D>A(
 
A(F (## (   1 12 ( !!1!12	 (
 
u  %"2"22	3 (D=(##=(   1 12=( !!1!12	=(
 
u  %"2"22	3=(D ,0595959[##[ "%[ !!1!12	[
   1 12[ !!1!12[~ *./3:>	A0&A0 u||,A0 tC$567	A0
 
u||Xc]Dell1B,CC	DA0J *./3:>	`&` u||,` tC$567	`
 
		`@"||" ," 38n	"
 
		"H'||' #3-	'
 ,' 
c3h'^ *.9/9/ 9/ 3,-	9/
 !&9/ &9/ 
uc5<<&7!88	99/v #(04 ' '  ' E,,- '
 
uc3h/	0 '  'L $)// 38n/ !	/
 / 
c3h/bN#+N# ##N# ||	N#
 +N# .N# 4N# 7N# N# 
N#f /3.2W[:> $156:AEc+c 'smc !++	c
 #+8S%,,4Gc4R+S"Tc ##67c c tCH~.c &ell3c )1(>c 
cR :>	%+% $$89% 56	% 
%N#/1EEF# .0DDE# 
"$88	9	#R 04!&x!<<x! ell#x! u||,	x!
 x! 
x!t</4S> /b*X6!r cgW/!)*:!;W/QYZ^Q_W/rvW/	%	&W/r8L htCsTWxDX?Y6Z L \@$'@58@IL@V[VbVb@	@D 
 
 
$y+y y +	y
 y y y 
yvZ$ Z 5959	PS+PS $,D>PS u||S012	PSd( (Rb (gk (T U]]_ *.8<:><@W[&*7;-16:AE-1)-R&R $$45R ##67	R
 $$89R #+8S%,,4Gc4R+S"TR d^R ""34R >*R &ell3R )1(>R %TNR "#R 
~u///	0R RhD t ]b]i]i nr & ]aS))S6>?X6YS			SjR##R 3S	>*R .	R
 0R ,R R !R 
$e&6&66	7Rh U]]_\##\ .\ 0	\
 ,\ \ >*\ 
$e&6&66	7\ \|F##F .F 0	F
 ,F F >*F 
$e&6&66	7FP H%,, H5<< H H
 JELL Jc Jc JV[VbVb J J
 ell %,, 5<<  & ,
-2\\,
"\\,
 \\,
  ,,	,

 ,
 ,
  ,
 dCi(,
 ,
 ,
\ M-2\\M,,M ,1<<M dCi(	M M,4Q$||4Q !<<4Q $ll	4Q
 4Q  4Q 4Q 4Q 4Q 4Q 4Q 
u||U\\5<<7	84QlLL !&L $)<<	L
 ,1<<L L 
u||U\\5<<7	8L,3F<<3F !&3F \\	3F
 3F ll3F $)<<3F .3\\3F  ,,3F ,1<<3F !<<3F 3F 3F  3F 3F  dCi(!3F" 
u||U\\5<<E	F#3FnN##N .N 0	N
 ,N N 
!5#3#33	4N`
e1##e1  e1 .	e1
 0e1 ,e1 e1N	n1##n1 "=n1 .	n1
 0n1 ,n1 n1 
!5#3#33	4n1`H##H 0H .	H
 0H ,H H >*H 
$e&6&66	7HT*5+;+; *P` *rw   r   c                 d   | dd| df   }|j                  d      }|ddt        j                  |      |f   j                  dd      }|j                  d      }|ddt        j                  |      |f   j                  dd      }	|	|z  }
t        j                  |
      }||
k  }| j                  d      dk  j                         }|r||k(  r|dz  }|ddd|dz   f   }||fS |j                  d   }|dd|ddf   }||k  rF|dd|ddf   }t        j                  ||z
  d      }|j                  |j                                n|}t        j                  |d      j                  d      dddf   }|dkD  r&t        j                  |ddd|f   |fd      }||fS |}||fS )a  
    Applies sampling as in the speculative decoding paper (https://huggingface.co/papers/2211.17192, algorithm 1). Returns
    the selected tokens, as well as the number of candidate matches.

    NOTE: Unless otherwise stated, the variable names match those in the paper.
    Nr   rL  r   r&   )r  r  )r  rq   rb  r  	rand_liker   r  r   clampdiv_r  r   )r  r  r  r  r  new_candidate_input_idsqq_ir<  p_iprobability_ratior_iis_acceptedr  r  gamma
p_n_plus_1
q_n_plus_1p_primer  s                       rx   r  r    s    2!6F5F5G2GH 	  R (A
Au||,-/FF
G
O
OPQST
UCr"A
Au||,-/FF
G
O
OPQST
UCc	
 //+
,C**K,&&2&.2779I Y*:: 	Q	.q/IM//AB& ""! !&&q)q)Q'
u1i?+Jkk:
#:CGLL' Gg15==a@qI q= 99&=a)m&La%PVXYL "" L""rw   c                 *   t        |       dk(  r<d}|D ]%  }|r|n|j                  d   }||dd|d|f   fz  }' | |fz  } |dz  }||z  }t        |      D ]:  }d}|D ]+  }|r||z   n|j                  d   }||d||dz   d|f   fz  }- | |fz  } < | S )z
    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
    where each member corresponds to a single generated token.
    r   rv   r   .Nr&   )r   r   r  )	r\  new_outputsr*  	added_lenr  	new_tupler+  last_dim_sizer  s	            rx   r  r  <  s     7|q	  	AE';GRM%XgX~~ =>@@I	A 	I<1W	9  	  	BE+?GaKU[[QS_M%QQY >?AAI	B 	I<  Nrw   r8  r6  next_top_k_probsr+  alpha
beam_widthr   c                 V   | | j                  dd      z  }||j                  dd      z  }t        j                  ||j                  dd            j	                  d      }|j                  |j                        }d|z
  t        j                  |j                        j                  z  }||z   }t        j                  |d      \  }	}
|j                  d      }d|z
  |z  ||	z  z
  }t        j                  t        j                  ||            }|j                  d      \  }
}|S )	a  
    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
    row in the batch.
    r   Trb  r&   r   r   rL  r  )normrq   matmulr  r  r   r   finfor  r   rN  r  r'  )r8  r6  r  r+  r  r  norm_context_hiddennorm_next_hiddencosine_matrixdegeneration_penaltyr   contrastive_scorer9  s                rx   r&  r&  V  s.    )>+>+>1d+>+SS"[%5%5!T%5%JJLL!46F6P6PQRTU6VW__`bcM ,..]5H5H.I00EKK@S@S4T4X4XX!$66M#ii2>!',,R0u(885CW;WWEKK0A:$NO'+++3OA|rw   r  r   c                    | st        d      t        | d         t        fd| D              st        d      d }j                  D ci c]#  }| || D cg c]  }t	        ||       c}      % }}} di |S c c}w c c}}w )z
    Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
    specific ModelOutput subclass from the list provided.
    zInput list is empty.r   c              3   6   K   | ]  }t        |        y wr   )r   )r;  objmodel_output_clss     rx   r>  z&stack_model_outputs.<locals>.<genexpr>  s     JSz#/0Jr  z4All elements in the list should be of the same type.c           	          t        d  D              ryt         d   t        j                        rt        j                   d      S t         d   t
              rht         d   d   t
              r)t         fdt        t         d               D              S t         fdt        t         d               D              S t         d   t        t        f      rt        j                         S t        dt         d                )z5
        Reverse of `_split` function above.
        c              3   $   K   | ]  }|d u  
 y wr   rv   )r;  datas     rx   r>  z7stack_model_outputs.<locals>._concat.<locals>.<genexpr>  s     -tt|-rn  Nr   rL  c           
   3   z   K   | ]1  t        fd t        t        d   d               D               3 yw)c              3   ~   K   | ]/  }t        j                  D cg c]
  }|   |    c}d        1 yc c}w wr   rL  Nrq   r   )r;  jr  r$  r  s      rx   r>  zAstack_model_outputs.<locals>._concat.<locals>.<genexpr>.<genexpr>  s5     ja%))D$ADT!WQZ$AqIIj$As   =8
=r   N)rt   r  r   )r;  r  r$  s    @rx   r>  z7stack_model_outputs.<locals>._concat.<locals>.<genexpr>  s=       jSXY\]abc]def]gYhSijjs   7;c              3   x   K   | ],  }t        j                  D cg c]  }||   	 c}d        . yc c}w wr'  r(  )r;  r  r  r$  s      rx   r>  z7stack_model_outputs.<locals>._concat.<locals>.<genexpr>  s1     gaUYYD'ADQ'AqIIg'As   :5
:zUnexpected attribute type: )r'  r   rq   r  r   rt   r  r   r   r  rj  	TypeErrorr{  )r$  s   `rx   _concatz$stack_model_outputs.<locals>._concat  s     ---d1gu||,99Tq))Q'$q'!*e, "3tAw<0  
 gSXY\]abc]dYeSfgggQ#u.<<%%9$tAw-IJJrw   rv   )r  r{  rQ  __dataclass_fields__r   )r  r   r,  r  model_outputconcatenated_datar!  s         @rx   r%  r%  u  s    
 /00 M!,- JMJJOPPK4 "66 	
7OGL!,OPP  0/00 Ps   BB-	BBg?InfgMbPri   baseline_scoresrelative_topfilter_valuer  c                 z   | j                  d      }|j                  d      }t        j                  |d      \  }}	|d|dz
  f   }
t        j                  |d      j                  }|t        j                  |      z   }t        j                  |
|      }|j                  d      }||||k  <   ||||k  <   ||fS )a]  
    Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
    Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
    r   rL  T)
descending.r&   )	r  rq   sortr   r  nplogr  r`  )ri   r1  r2  r3  base_filter_valuer  scores_normalizedbaseline_scores_normalizedsorted_logitssorted_indices
min_thresh	probs_maxprobs_threshs                rx   _relative_top_filterrA    s     **r*2!0!<!<!<!D$)JJ/@T$R!M>s$6$::;J		+4;;Irvvl33L99Z6L))"-LCT0<?@:F',67888rw   r  r  r  c                    t        |       dk(  r|| d      }t        ||      \  }}||z
  }|S t        j                  | D cg c]  }||   	 c}d      }t	        j
                  |d      }t	        j
                  |d      }d|d d d d d f   |z   z  }	t	        j                  |d      }
t	        j                  |d      }t	        j                  |
d d d d d f   |	d      j                  d      }t	        j                  ||	d      j                  d      }d||z   z  }|j                  d      }| t        |j                         j                                  }||   }t        ||      \  }}||z
  }|S c c}w )Nr&   r   rL  r   g      ?none)	reduction)r   rA  rq   r  Fr  r  kl_divmeanr   r	  rR  )r  r  r  base_logitsrj   r  stacked_premature_layerssoftmax_mature_layersoftmax_premature_layersavg_distlog_softmax_mature_layerlog_softmax_premature_layerskl1kl2js_divspremature_layers                   rx   r  r    s   
 %&!+01KA1NO$8{$S!k+  %{{Sm+na,Fq,I+ntuv 99\r: yy)ArJ *4A:69QQRH  !}}\rB#$==1Ir#R  ((+D!QJ7V
T
Y
YZ\
]C
((/V
L
Q
QRT
UCS3YG ll2G0W^^5E5J5J5L1MNO,_=K 4\; OL+K'FMA ,os    E9)F)r  r   r   r  dataclassesr   typingr   r   r   r   r   numpyr7  rq   torch.distributeddistributedr  huggingface_hubr	   	packagingr
   r   torch.nnr   rE  cache_utilsr   r   r   r   r   r   configuration_utilsr   dynamic_module_utilsr   r   r   r   integrations.deepspeedr   integrations.fsdpr   masking_utilsr   modeling_outputsr   r   pytorch_utilsr   tokenization_utilsr   utilsr    r!   r"   r#   r$   r%   beam_constraintsr'   r(   beam_searchr)   r*   r+   r}  r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   continuous_batchingr:   logits_processr;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   r  rU   rV   rW   rX   rY   rZ   r[   modeling_utilsr\   tokenization_utils_baser]   	streamersr^   
get_loggerrm   r  accelerate.hooksr_   r`   r_  rg   rz   r   r   GreedySearchDecoderOnlyOutput"ContrastiveSearchDecoderOnlyOutputSampleDecoderOnlyOutput%ContrastiveSearchEncoderDecoderOutput GreedySearchEncoderDecoderOutputSampleEncoderDecoderOutputBeamSearchDecoderOnlyOutputBeamSampleDecoderOnlyOutputBeamSearchEncoderDecoderOutputBeamSampleEncoderDecoderOutputGreedySearchOutputSampleOutputBeamSearchOutputBeamSampleOutputContrastiveSearchOutputr  r  r  r   r  r  ru   rr   r  r   r&  r  r%  rA  r  r  rv   rw   rx   <module>r}     s      	  ! @ @     '   $  3  @ 6 5 F - /  G R R
 
 
  1      8   0A'			H	%E  M  M  MF ,M; ,M ,M^ (MK (M (MV 5M{ 5M 5Mt !: %> "3 (D %#?  9 ; ; !A !A ;=ZZ[ /1HHI79TTU 79TTU  EGi ij  79UUV 8:ZZ[ ,.@@A_Jo _JDU5#p4%%"" '' ((	
   >.1tK'8 .1BR .1Wb .1h  <-99&&9 9 	9 9 92, $S	, $S%*;*;%; <, ##, 	,rw   