
"""
Feature extractor class for Whisper
"""

from typing import Optional, Union

import numpy as np

from ... import is_torch_available
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class WhisperFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has a similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
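
    Example (a minimal usage sketch; the silent two-second waveform below is just a stand-in for real audio):

    ```python
    >>> import numpy as np
    >>> from transformers import WhisperFeatureExtractor

    >>> feature_extractor = WhisperFeatureExtractor()
    >>> waveform = np.zeros(2 * 16000, dtype=np.float32)  # 2 seconds of mono audio sampled at 16 kHz
    >>> inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
    >>> inputs["input_features"].shape  # (batch, feature_size, nb_max_frames); inputs are padded to 30 seconds
    (1, 80, 3000)
    ```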
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
        padding_value=0.0,
        dither=0.0,
        return_attention_mask=False,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.sampling_rate = sampling_rate
        self.dither = dither
        # Slaney-normalized mel filter bank over 0-8000 Hz, matching Whisper's original filters
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=1 + n_fft // 2,
            num_mel_filters=feature_size,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

    def _np_extract_fbank_features(self, waveform_batch: np.ndarray, device: str) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio. This gives results similar to Whisper's original torch
        implementation, within a 1e-5 tolerance.
        """
        cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/hanng       @log10)frame_lengthr$   powerr(   r)   log_melN       @      @)
ValueErrorr   r	   r#   r$   r(   r)   npmaximummaxappendarray)r*   r/   r0   log_spec_batchwaveformlog_specs         r-   _np_extract_fbank_featuresz2WhisperFeatureExtractor._np_extract_fbank_featuresl   s    
 U?vh 'q q 
 & 	,H"

F3!ZZ??{{ ,,	H  3B3'Hzz(HLLNS,@AH 3#-H!!(+	, .1r.   rC   c                 8   t        j                  |      j                  |t         j                        }t        j                  | j
                  |      }| j                  dk7  rF|| j                  t        j                  |j                  |j                  |j                        z  z  }t        j                  || j
                  | j                  |d      }|dddf   j                         d	z  }t        j                  | j                        j                  |t         j                        }|j                  |z  }t        j                   |d
      j#                         }|j%                         d	k(  rD|j'                  d	d      d   j'                  dd      d   }	t        j(                  ||	dz
        }n't        j(                  ||j'                         dz
        }|dz   dz  }|dk7  r|j+                         j-                         }|j/                         S )z
        Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
        yielding results similar to cpu computing with 1e-5 tolerance.
        )r0   r   )dtyper0   T)windowreturn_complex.Nr9   r   g|=)min)dimkeepdimr   r   r:   r;   r3   )torch
from_numpytofloat32hann_windowr#   r(   randnshaperG   r0   stftr$   absr)   Tclampr5   rK   r?   r>   detachr3   numpy)
r*   rC   r0   rH   rT   
magnitudesr)   mel_specrD   max_vals
             r-   _torch_extract_fbank_featuresz5WhisperFeatureExtractor._torch_extract_fbank_features   s   
 ##H-00G""4::f=
 ;;#ekk(.._g_n_n&oooHzz(DJJ_cd#ss(^'')Q.
&&t'7'78;;FEMMR==:-;;xU399;<<>Qllq$l7:>>1d>STUVG}}Xw}=H}}Xx||~/CDHsNc)U?(,,.H~~r.   input_valuesattention_maskr   c                    |t        j                  |t         j                        }g }t        | |j	                  d            D ]m  \  }}||d| j                         z
  t        j                  |d| j                         dz         z  }||j                  d   k  r|||d |j                  |       o |S | D cg c]<  }||j                         z
  t        j                  |j                         dz         z  > }}|S c c}w )z[
        Every array in the list is normalized to have zero mean and unit variance
        Nr9   gHz>r   )
r=   rA   int32zipsummeansqrtvarrS   r@   )r^   r_   r   normed_input_valuesvectorlengthnormed_slicexs           r-   zero_mean_unit_var_normz/WhisperFeatureExtractor.zero_mean_unit_var_norm   s    %XXnbhh?N"$"%lN4F4Fr4J"K 9 &)=)=)? ?2776RYSY?K^K^K`cgKgChhL..q11,9L)#**<89 #" Vb"bPQALBGGAEEGdN4K#K"b"b"" #cs   :AC?
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizereturn_token_timestampsc                    |O|| j                   k7  rmt        d| j                  j                   d| j                    d| j                    d| d	      t        j                  d| j                  j                   d       t        |t        j                        xr t        |j                        d	kD  }|r&t        |j                        d
kD  rt        d|        |xs@ t        |t        t        f      xr( t        |d   t        j                  t        t        f      }|r>|D cg c]2  }t        j                  |gt        j                        j                  4 }}n|s@t        |t        j                        s&t        j                  |t        j                        }nht        |t        j                        rN|j                   t        j                   t        j"                        u r|j%                  t        j                        }|s!t        j                  |g      j                  g}t'        d|i      }| j)                  |||r|n| j*                  |||xs |	      }|	rD| j-                  |d   |d   | j.                        |d<   t        j0                  |d   d      |d<   |j3                  d      j5                  d
dd	      }t7               r| j8                  n| j:                  } ||d   |
      }t        |d   t              r7|D cg c]'  }t        j                  |t        j                        ) c}|d<   n||d<   |r|d   dddd| j<                  f   |d<   |Vt        j?                  d| j                  j                   d       |D cg c]  }t        |      | j<                  z   c}|d<   ||jA                  |      }|S c c}w c c}w c c}w )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy-based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.0` (Volta), or on TPUs, which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and to allow the automatic speech
                recognition pipeline to work correctly.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead, from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
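
        Example (a sketch assuming torch is installed; the silent waveforms stand in for real recordings, and
        `openai/whisper-tiny` is used only as a readily available checkpoint):

        ```python
        >>> import numpy as np
        >>> from transformers import WhisperFeatureExtractor

        >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
        >>> batch = [np.zeros(16000, dtype=np.float32), np.zeros(48000, dtype=np.float32)]  # 1 s and 3 s clips
        >>> inputs = feature_extractor(batch, sampling_rate=16000, return_tensors="pt", return_attention_mask=True)
        >>> inputs["input_features"].shape
        torch.Size([2, 80, 3000])
        >>> inputs["attention_mask"].shape  # one mask value per feature frame (sample mask downsampled by hop_length)
        torch.Size([2, 3000])
        ```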
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech, dtype=np.float32)
        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
            raw_speech = raw_speech.astype(np.float32)

        # always return a batch
        if not is_batched:
            raw_speech = [np.asarray([raw_speech]).T]

        batched_speech = BatchFeature({"input_features": raw_speech})

        # pad (or truncate) every example to `n_samples` (30 s) unless an explicit `max_length` is given
        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length if max_length else self.n_samples,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask or do_normalize,
        )

        # zero-mean and unit-variance normalization
        if do_normalize:
            padded_inputs["input_features"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_features"],
                attention_mask=padded_inputs["attention_mask"],
                padding_value=self.padding_value,
            )
            padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)

        # make sure the list is in the right array format before extracting the log-mel features
        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)

        extract_fbank_features = (
            self._torch_extract_fbank_features if is_torch_available() else self._np_extract_fbank_features
        )
        input_features = extract_fbank_features(input_features[0], device)

        if isinstance(input_features[0], list):
            padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
        else:
            padded_inputs["input_features"] = input_features

        if return_attention_mask:
            # rescale the attention mask from the sample level to the feature-frame level
            padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]

        if return_token_timestamps is not None:
            logger.warning_once(
                f"`return_token_timestamps` is deprecated for {self.__class__.__name__} and will be removed in "
                "Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it."
            )
            padded_inputs["num_frames"] = [len(raw_speech_i) // self.hop_length for raw_speech_i in raw_speech]

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs


__all__ = ["WhisperFeatureExtractor"]