
import math
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_voxtral import VoxtralConfig, VoxtralEncoderConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None and attention_mask.ndim == 4:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
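
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a toy call illustrating the
# shape contract of `eager_attention_forward`. The sizes below are assumptions
# chosen purely for illustration; the helper itself is shape-generic.
# ---------------------------------------------------------------------------
def _eager_attention_shape_sketch():
    batch, heads, seq_len, head_dim = 2, 4, 6, 8  # assumed toy sizes
    module = nn.Module()  # only `.training` is read by the helper
    query = torch.randn(batch, heads, seq_len, head_dim)
    key = torch.randn(batch, heads, seq_len, head_dim)
    value = torch.randn(batch, heads, seq_len, head_dim)
    attn_output, attn_weights = eager_attention_forward(module, query, key, value, attention_mask=None)
    # the output is transposed back to (batch, seq_len, heads, head_dim)
    assert attn_output.shape == (batch, seq_len, heads, head_dim)
    assert attn_weights.shape == (batch, heads, seq_len, seq_len)
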
class VoxtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: Optional[int] = None,
        config: Optional[VoxtralConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        # the query is pre-scaled, so `scaling=1.0` is handed to the attention interface below
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
class VoxtralEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = VoxtralAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16:
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class VoxtralPreTrainedModel(PreTrainedModel):
    config: VoxtralConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = None
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.audio_config.initializer_range
        )

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(VoxtralPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    """

    config: VoxtralEncoderConfig
    main_input_name = "input_features"
    _no_split_modules = ["VoxtralEncoderLayer"]
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    def __init__(self, config: VoxtralEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([VoxtralEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.avg_pooler = nn.AvgPool1d(2, stride=2)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @check_model_inputs
    def forward(
        self,
        input_features: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Voxtral expects the mel input features to be of length {expected_seq_length}, but found"
                f" {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers and the output length of the audio encoder
        """
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 1) // 2 + 1
        return input_lengths, output_lengths


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
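
# ---------------------------------------------------------------------------
# Worked sketch (assumed 30 s input) of the arithmetic in
# `VoxtralEncoder._get_feat_extract_output_lengths`: the stride-2 conv halves
# the 3000-frame mel spectrogram, and the reported encoder output length is
# halved once more.
# ---------------------------------------------------------------------------
def _feat_extract_lengths_sketch():
    mel_len = 3000  # assumed: 30 s of 10 ms mel frames, the expected input length
    conv_len = (mel_len - 1) // 2 + 1
    encoder_len = (conv_len - 1) // 2 + 1
    assert (conv_len, encoder_len) == (1500, 750)
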
@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor) -> torch.FloatTensor:
        """
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # replace audio token placeholders in the text embeddings with the audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            # input_features should only be passed during the prefill stage
            model_inputs["input_features"] = input_features

        return model_inputs
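
# ---------------------------------------------------------------------------
# Minimal sketch (assumed toy ids/sizes) of the audio/text merge performed in
# `VoxtralForConditionalGeneration.forward`: embedding rows whose token equals
# `config.audio_token_id` are overwritten with projected audio embeddings.
# ---------------------------------------------------------------------------
def _audio_token_splice_sketch():
    audio_token_id = 24  # assumed placeholder id, for illustration only
    input_ids = torch.tensor([[1, 24, 24, 5]])
    inputs_embeds = torch.zeros(1, 4, 8)
    audio_embeds = torch.ones(2, 8)  # one row per audio placeholder token
    inputs_embeds[input_ids == audio_token_id] = audio_embeds
    assert inputs_embeds[0, 1].sum() == 8 and inputs_embeds[0, 0].sum() == 0
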
__all__ = ["VoxtralForConditionalGeneration", "VoxtralPreTrainedModel", "VoxtralEncoder"]