
    rhR                     B   d dl Z d dlZd dlmZmZ ddlmZmZmZm	Z	  e       rd dl
Z
 e       rd dlZ e       rd dlmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ  e	j<                  e      Z  G d
 ded      Z! G d ded      Z" G d de      Z#dgZ$y)    N)OptionalUnion   )is_mistral_common_availableis_soundfile_availableis_torch_availablelogging)TranscriptionRequest)
AudioInputload_audio_asmake_list_of_audio)BatchFeature)AllKwargsForChatTemplateAudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                       e Zd ZU ee   ed<   y)VoxtralAudioKwargsmax_source_positionsN)__name__
__module____qualname__r   int__annotations__     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/voxtral/processing_voxtral.pyr   r   )   s    "3-'r   r   F)totalc                   0    e Zd Zddidddddddddd	d
Zy)VoxtralProcessorKwargspaddingTi>  Fi S i  )sampling_rater$   
truncationpad_to_multiple_ofr   pt)return_tensorsreturn_dicttokenize)text_kwargsaudio_kwargscommon_kwargsN)r   r   r   	_defaultsr   r   r    r#   r#   -   s;     t
 #"($(
 #
Ir   r#   c                   4    e Zd ZdZddgZdZdZ fdZd Zde	e
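
# A quick sanity check of the audio defaults above (illustrative note, not used at runtime):
# 480_000 samples at the 16 kHz sampling rate is exactly 30 s of audio -- one Whisper chunk --
# and with the feature extractor's hop length of 160 it corresponds to 480_000 / 160 = 3_000
# mel frames, which is the default `max_source_positions`.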
eeef      e
e
eeef         f   d	ee   d
efdZdee	eee
e   e
e   f      d	ee   fdZ	 	 dde	ee
e   f   de	ee
e   ef   dedee   dee	ee
e   f      d	ee   fdZd Zd Zd Z xZS )VoxtralProcessora  
    Constructs a Voxtral processor which wraps [`WhisperFeatureExtractor`] and
    [`MistralCommonTokenizer`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`MistralCommonTokenizer`]):
            The tokenizer is a required input.
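
    Example -- constructing the processor from the Hub (a minimal sketch; the checkpoint name is
    taken from the method examples below):

    ```python
    from transformers import VoxtralProcessor

    processor = VoxtralProcessor.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
    ```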
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "MistralCommonTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        # id of the audio placeholder token in the tokenizer's vocabulary
        self.audio_token_id = 24
        self.audio_token = tokenizer.convert_ids_to_tokens(self.audio_token_id)
        super().__init__(feature_extractor, tokenizer)

    def _retreive_input_features(self, audio, max_source_positions, **kwargs):
        """
        Handles the specific logic of Voxtral's expected input features: audio arrays should be padded to the next
        multiple of 480000 samples (so that each duration is a multiple of 30s), see VoxtralProcessorKwargs' default
        audio_kwargs. Mel input features are then extracted and stacked along the batch dimension, split into chunks
        of max_source_positions.
        """
        input_features_list = []
        for audio_array in audio:
            audio_inputs = self.feature_extractor(audio_array, **kwargs)
            # (feature_size, num_frames) -> (num_chunks, feature_size, max_source_positions)
            input_features = audio_inputs["input_features"].reshape(
                self.feature_extractor.feature_size, -1, max_source_positions
            )
            input_features_list.append(input_features.transpose(0, 1))

        return torch.cat(input_features_list)

    def apply_chat_template(
        self,
        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
        **kwargs: Unpack[AllKwargsForChatTemplate],
    ) -> str:
        r"""
        This method applies the model's chat completion template given a conversation. It relies on MistralCommonTokenizer's
        [`~MistralCommonTokenizer.apply_chat_template`] to prepare input ids to the model and on WhisperFeatureExtractor's
        [`~WhisperFeatureExtractor.__call__`] to prepare input features to the model.

        Note that audio is padded to the nearest 30-second multiple prior to mel feature extraction.

        A `conversation` is a list of messages, where each message is a dictionary with a `role` and a `content` field.
        For Voxtral, `role` can be `"user"` or `"assistant"`.
        The `content` field can be a string or a list of dictionaries with a `type` field. See example below.

        ```python
        from huggingface_hub import hf_hub_download
        from transformers.audio_utils import load_audio_as

        audio_url = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3"
        audio_path = hf_hub_download(repo_id="hf-internal-testing/dummy-audio-samples", filename="bcn_weather.mp3", repo_type="dataset")
        audio_base64 = load_audio_as(audio_path, return_format="base64", force_mono=True)

        # audio + text
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "url": audio_url},
                    {"type": "audio", "path": audio_path},
                    {"type": "audio", "base64": audio_base64},
                    {"type": "text", "text": "How many audio do you hear?"},
                ],
            },
        ]

        processor = VoxtralProcessor.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
        inputs = processor.apply_chat_template(conversation)
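        # Illustrative peek at the result (a sketch -- exact shapes depend on the audio clips):
        print(inputs.input_ids.shape)       # (1, sequence_length), audio placeholder tokens included
        print(inputs.input_features.shape)  # (num_30s_chunks, feature_size, 3000)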
        ```

        Args:
            conversation (`Union[list[dict[str, str]], list[list[dict[str, str]]]]`):
                The conversation to format.
        """
        if kwargs.get("continue_final_message", False):
            if kwargs.get("add_generation_prompt", False):
                raise ValueError(
                    "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message "
                    "when you want the model to continue the final message, and add_generation_prompt when you want "
                    "to add a header that will prompt it to start a new assistant message instead."
                )
            if kwargs.get("return_assistant_tokens_mask", False):
                raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")

        # dispatch the kwargs to the multimodal-loading and template kwarg groups
        processed_kwargs = {
            "mm_load_kwargs": {},
            "template_kwargs": {},
        }
        for kwarg_type in processed_kwargs:
            kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
            for key in kwarg_type_defaults.__annotations__:
                default_value = getattr(kwarg_type_defaults, key, None)
                value = kwargs.pop(key, default_value)
                if value is not None and not isinstance(value, dict):
                    processed_kwargs[kwarg_type][key] = value

        if isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
        ):
            is_batched = True
            conversations = conversation
        else:
            is_batched = False
            conversations = [conversation]

        mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
        if any(key in kwargs for key in mm_load_kwargs):
            overlapping_keys = [key for key in mm_load_kwargs if key in kwargs]
            logger.warning(
                f"{overlapping_keys[0] if len(overlapping_keys) == 1 else ', '.join(overlapping_keys)} load "
                f"multimodal data kwargs {'have' if len(overlapping_keys) > 1 else 'has'} been passed to the "
                f"processor, but {'they are' if len(overlapping_keys) > 1 else 'it is'} not supported for "
                "VoxtralProcessor since it relies on mistral_common directly. "
                f"{'They' if len(overlapping_keys) > 1 else 'It'} will be ignored."
            )

        output_kwargs = self._merge_kwargs(
            VoxtralProcessorKwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        tokenizer_kwargs = {**processed_kwargs["template_kwargs"], **text_kwargs}
        tokenizer_kwargs["return_tensors"] = None  # tensor conversion is handled by BatchFeature below
        tokenize = tokenizer_kwargs.pop("tokenize", common_kwargs.pop("tokenize", False))
        return_dict = tokenizer_kwargs.pop("return_dict", common_kwargs.pop("return_dict", False))

        encoded_instruct_inputs = self.tokenizer.apply_chat_template(
            conversations,
            tokenize=tokenize,
            return_dict=return_dict,
            **tokenizer_kwargs,
        )

        if tokenize and return_dict:
            audio = encoded_instruct_inputs.pop("audio", None)
            data = dict(encoded_instruct_inputs)
            if audio is not None:
                max_source_positions = audio_kwargs.pop("max_source_positions")
                data["input_features"] = self._retreive_input_features(audio, max_source_positions, **audio_kwargs)
            return BatchFeature(data=data, tensor_type=return_tensors)

        if not is_batched:
            return encoded_instruct_inputs[0]

        return encoded_instruct_inputs

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        **kwargs: Unpack[VoxtralProcessorKwargs],
    ) -> BatchFeature:
        r"""
        Method to prepare text to be fed as input to the model. This method forwards the `text`
        arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.__call__`] to encode
        the text. Please refer to the docstring of the above methods for more information.
        This method does not support audio. To prepare the audio, please use:
        1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
        2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        """
        if isinstance(text, str):
            text = [text]
        if any(self.audio_token in t for t in text):
            raise ValueError(
                f"{self.audio_token} is present in the provided text, which is not supported by VoxtralProcessor. "
                "Please use the `apply_chat_template` method instead."
            )

        output_kwargs = self._merge_kwargs(
            VoxtralProcessorKwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        out = self.tokenizer(text, **text_kwargs)
        return BatchFeature(out, tensor_type=common_kwargs.pop("return_tensors", None))

    def apply_transcription_request(
        self,
        language: Union[str, list[str]],
        audio: Union[str, list[str], AudioInput],
        model_id: str,
        sampling_rate: Optional[int] = None,
        format: Optional[Union[str, list[str]]] = None,
        **kwargs: Unpack[VoxtralProcessorKwargs],
    ):
        r"""
        This method applies the model's transcription request template given a language and audio.
        It relies on MistralCommonTokenizer and WhisperFeatureExtractor to prepare input ids and input features to the model.

        ```python
        from transformers import VoxtralProcessor

        model_id = "mistralai/Voxtral-Mini-3B-2507"
        processor = VoxtralProcessor.from_pretrained(model_id)

        language = "en"
        audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"

        inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id)
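        # `inputs` is a BatchFeature with "pt" tensors by default (illustrative sketch --
        # exact shapes depend on the audio length):
        print(inputs.input_ids.shape)       # (1, sequence_length)
        print(inputs.input_features.shape)  # (num_30s_chunks, feature_size, 3000)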
        ```

        Args:
            language (`str`, `list[str]`):
                The language or languages of the audio. If provided as a string, will be applied uniformly to all audio.
                If provided as a list, will be applied to each audio individually with a one-to-one mapping.
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. If provided as a string, it should correspond to the path or url of the audio file.
            model_id (`str`):
                The hub model id of the model to use for transcription.
            sampling_rate (`int`, *optional*):
                The sampling rate of the audio. Necessary if it is provided as `np.ndarray`, `torch.Tensor`, `list[np.ndarray]` or `list[torch.Tensor]`.
                Used to avoid silent errors when passing audio that is not in the expected sampling rate.
            format (`str`, `list[str]`, *optional*):
                The format of the audio, necessary if it is provided as `np.ndarray`, `torch.Tensor`, `list[np.ndarray]` or `list[torch.Tensor]`.
        """
        output_kwargs = self._merge_kwargs(
            VoxtralProcessorKwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        is_str = isinstance(audio, str)
        is_list_of_str = all(isinstance(el, str) for el in audio)
        is_list_of_audio = not (is_str or is_list_of_str)

        if is_list_of_audio:
            if sampling_rate is None:
                logger.warning_once(
                    "You've provided audio without specifying the sampling rate. It will be assumed to be "
                    f"{audio_kwargs['sampling_rate']}, which can result in silent errors."
                )
            elif sampling_rate != audio_kwargs["sampling_rate"]:
                raise ValueError(
                    f"The sampling rate of the audio ({sampling_rate}) does not match the sampling rate of the "
                    f"processor ({audio_kwargs['sampling_rate']}). Please resample the audio to the expected "
                    "sampling rate."
                )

        sampling_rate = audio_kwargs["sampling_rate"]
        return_dict = common_kwargs.pop("return_dict", False)
        tokenize = common_kwargs.pop("tokenize", False)
        # make sure these control flags don't leak into the tokenizer/feature-extractor calls
        for key in ("return_dict", "tokenize"):
            text_kwargs.pop(key, None)
            audio_kwargs.pop(key, None)

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        if is_str:
            audio = [load_audio_as(audio, return_format="buffer", force_mono=True)]
        elif is_list_of_str:
            audio = [load_audio_as(el, return_format="buffer", force_mono=True) for el in audio]
        else:
            audio = make_list_of_audio(audio)
            if format is None:
                raise ValueError("`format` must be provided when audio is passed as array(s).")
            if isinstance(format, str):
                format = [format] * len(audio)
            if len(format) != len(audio):
                raise ValueError(
                    f"When passed as a list of audio, the length ({len(audio)}) must match the number of formats ({len(format)})."
                )

            # serialize each array into an in-memory audio file so mistral_common can consume it
            audio_buffers = []
            for array, f in zip(audio, format):
                buffer = io.BytesIO()
                if array.ndim == 2:
                    array = array.mean(axis=1)
                sf.write(buffer, array, samplerate=sampling_rate, format=f)
                buffer.seek(0)
                audio_buffers.append(buffer)
            audio = audio_buffers

        n_audio = len(audio)
        if isinstance(language, str):
            language = [language] * n_audio
        if len(language) != n_audio:
            raise ValueError(
                f"When passed as a list of languages, the length ({len(language)}) must match the number of audio ({n_audio})."
            )

        input_ids = []
        texts = []
        audio_arrays = []
        for audio_el, language_el in zip(audio, language):
            openai_transcription_request = {
                "model": model_id,
                "file": audio_el,
                "language": language_el,
            }
            transcription_request = TranscriptionRequest.from_openai(openai_transcription_request)
            tokenized_transcription_request = self.tokenizer.tokenizer.encode_transcription(transcription_request)
            input_ids.append(tokenized_transcription_request.tokens)
            texts.append(tokenized_transcription_request.text)
            audio_arrays.extend([audio.audio_array for audio in tokenized_transcription_request.audios])

        if tokenize and return_dict:
            out = self.tokenizer(texts, add_special_tokens=False, **text_kwargs)
            data = dict(out)
            max_source_positions = audio_kwargs.pop("max_source_positions")
            data["input_features"] = self._retreive_input_features(audio_arrays, max_source_positions, **audio_kwargs)
            return BatchFeature(data=data, tensor_type=return_tensors)

        if tokenize:
            return input_ids

        return texts

    def apply_transcrition_request(self, *args, **kwargs):
        """
        Deprecated typo'd method. Use `apply_transcription_request` instead.
        """
        warnings.warn(
            "`apply_transcrition_request` is deprecated due to a typo and will be removed in a future release. "
            "Please use `apply_transcription_request` instead.",
            FutureWarning,
        )
        return self.apply_transcription_request(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)


__all__ = ["VoxtralProcessor"]