
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from ..qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioAttention,
    Qwen2AudioEncoder,
    Qwen2AudioEncoderLayer,
    Qwen2AudioPreTrainedModel,
)
from .configuration_voxtral import VoxtralConfig


class VoxtralAttention(Qwen2AudioAttention):
    pass


class VoxtralEncoderLayer(Qwen2AudioEncoderLayer):
    pass


class VoxtralPreTrainedModel(Qwen2AudioPreTrainedModel):
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True
    _no_split_modules = None


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(Qwen2AudioEncoder):
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    @check_model_inputs
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`; this argument is preserved for compatibility,
                but it is not used. By default, the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[1]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, but found "
                f"{input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        # The two convolutional stems map the log-mel spectrogram to the transformer width and downsample it in time.
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    c                       e Zd ZdgZddiZddgdgfiZdgZ fdZd Zd	 Z	d
 Z
d Zd Zd Zdej                  fdZee	 	 	 	 	 	 	 	 	 	 ddeej(                     deej                     deej*                     deej(                     dee   deej                     deej(                     dee   deej(                     deeej*                  f   dee   defd              Z fdZ xZS )VoxtralForConditionalGenerationzlm_head.weightlm_headcolwise_repr2   logitsrJ   c                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y rX   )r^   r_   rc   
vocab_sizer   from_configra   audio_towerr   language_modelr[   multi_modal_projector	post_initri   s     r#   r_   z(VoxtralForConditionalGeneration.__init__   sn      ,,77$001D1DE2>>v?Q?QR%?%G" 	r"   c                 6    | j                   j                         S rX   )rx   get_input_embeddingsrO   s    r#   r|   z4VoxtralForConditionalGeneration.get_input_embeddings   s    ""7799r"   c                 :    | j                   j                  |       y rX   )rx   set_input_embeddings)rO   values     r#   r   z4VoxtralForConditionalGeneration.set_input_embeddings   s    007r"   c                 6    | j                   j                         S rX   )rx   get_output_embeddingsr}   s    r#   r   z5VoxtralForConditionalGeneration.get_output_embeddings   s    ""88::r"   c                 :    | j                   j                  |       y rX   )rx   set_output_embeddings)rO   new_embeddingss     r#   r   z5VoxtralForConditionalGeneration.set_output_embeddings   s    11.Ar"   c                 :    | j                   j                  |       y rX   )rx   set_decoder)rO   decoders     r#   r   z+VoxtralForConditionalGeneration.set_decoder   s    ''0r"   c                 6    | j                   j                         S rX   )rx   get_decoderr}   s    r#   r   z+VoxtralForConditionalGeneration.get_decoder   s    ""..00r"   rP   c                     | j                  |      }|j                  }|j                  d| j                  j                  j
                        }| j                  |      }|S )a  
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state
        # Regroup consecutive encoder frames so that each row matches the projector input width (intermediate_size).
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # Replace the audio placeholder tokens with the projected audio embeddings.
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # `input_features` should only be forwarded on the first (prefill) step; later cached
        # decoding steps reuse the audio embeddings already baked into the KV cache.
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]
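

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API). It
# mirrors the example in `VoxtralForConditionalGeneration.forward` and adds a
# peek at `get_audio_embeds`, which is what `forward` calls internally before
# splicing audio embeddings into the token embeddings. The checkpoint id and
# audio URL are taken from that docstring; everything else is an assumption
# about a typical local setup (GPU optional). Guarded by `__main__` so that
# importing this module stays side-effect free.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoProcessor, VoxtralForConditionalGeneration

    device = "cuda" if torch.cuda.is_available() else "cpu"
    repo_id = "mistralai/Voxtral-Mini-3B-2507"

    processor = AutoProcessor.from_pretrained(repo_id)
    model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                },
                {"type": "text", "text": "What can you tell me about this audio?"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # The processor output carries both the tokenized prompt (`input_ids`) and the
    # log-mel features (`input_features`); the latter can also be projected directly.
    with torch.no_grad():
        audio_embeds = model.get_audio_embeds(inputs.input_features)
    print("audio embeddings:", tuple(audio_embeds.shape))  # one row per audio token, in the text hidden size

    outputs = model.generate(**inputs, max_new_tokens=30)
    print(processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True))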