
    rh;                     &   d dl Z d dlmZ d dlmZmZ d dlZddlm	Z	 ddl
mZ ddlmZmZmZ  ej                   e      Z	 dded	ed
ededededee   dej,                  fdZdej,                  dedededej,                  f
dZ G d de	      ZdgZy)    N)Sequence)OptionalUnion   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingn_freqsf_minf_maxn_melssample_rate
fft_lengthnormreturnc                    ||dk7  rt        d      t        j                  | t        j                        ||z  z  }dt	        j
                  d|dz  z         z  }dt	        j
                  d|dz  z         z  }	t        j                  ||	|dz         }
dd	|
dz  z  dz
  z  }|d
d |dd z
  }t        j                  |d      t        j                  |d
      z
  }t        j                  d
t        j                        }d|ddddf   z  |dd z  }|ddddf   |d
d z  }t        j                  |t        j                  ||            }|/|dk(  r*d|d|dz    |d| z
  z  }|t        j                  |d      z  }|S )a  Create a frequency bin conversion matrix (NumPy version).

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_mels (int): Number of mel filterbanks
        sample_rate (int): Sample rate of the audio waveform
        fft_length (int): FFT length
        norm (Optional[str]): If 'slaney', divide the triangular mel weights by
          the width of the mel band (area normalization). (Default: ``None``)

    Returns:
        np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``,
        ``n_mels``)
        meaning number of frequencies to highlight/apply to x the number of
        filterbanks.
        Each column is a filterbank so that assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``.
    Nslaneyz$norm must be one of None or 'slaney'dtypeg     F@      ?g     @   
      r   g      g       @)
ValueErrornparangefloat32mathlog10linspaceexpand_dimszerosmaximumminimum)r   r   r   r   r   r   r   	all_freqsm_minm_maxm_ptsf_ptsf_diffslopeszerodown_slopes	up_slopesfbenorms                      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.pycreate_fb_matrixr6      s   > DH,?@@ 		'4j8PQI TZZuu} 566ETZZuu} 566EKKufqj1EREFN+c12E12Ys#F^^E1%y!(DDF88ARZZ(D&CRC.(F3BK7Kq!"uqr
*I	D"**[)<	=BDH,uQ!,uWf~=>
bnnUA&&I    array	dimensionsizestepc                    | j                   dk7  rt        d      |dk7  r|| j                   dz
  k7  rt        d      | j                  \  }}||z
  |z  dz   }|dk  r$t        j                  |d|f| j
                        S |||f}| j                  d   | j                  d   |z  | j                  d   f}t        j                  j                  j                  | ||      S )	zNA basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.r   zFThis unfold implementation currently supports 2D arrays (batch, time).r   r   zFThis unfold implementation only supports unfolding the last dimension.r   r   )shapestrides)
ndimr   r=   r   r&   r   r>   libstride_tricks
as_strided)	r8   r9   r:   r;   
batch_sizeoriginal_length
num_framesoutput_shapeoutput_stridess	            r5   _unfoldrH   [   s    zzQabbB9

Q6abb"'++J!D(T1A5JQxxQ-U[[AA
D1LmmA&a(84(?qAQRN66**5n*]]r7   c            #           e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d dededededed	ed
ededededededededee	e      dee	e      f  fdZ
dej                  dej                  deej                  ej                  f   fdZ	 	 	 	 	 	 d!deej                  ee   eej                     eee      f   deeeef   dee   dedee   deeeef      dee   defdZ xZS )"Gemma3nAudioFeatureExtractoraL
  An audio feature extractor Universal Speech Models https://arxiv.org/abs/2303.01037.

    Args:
        feature_size (`int`, *optional*, defaults to 128):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention mask for the generated MEL spectrograms.
        frame_length_ms (`float`, *optional*, defaults to 32.0):
            The length of a frame in milliseconds.
        hop_length_ms (`float`, *optional*, defaults to 10.0):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        min_frequency (`float`, *optional*, defaults to 125.0):
            The minimum frequency (in Hz) for the Mel filterbank.
        max_frequency (`float`, *optional*, defaults to 7600.0):
            The maximum frequency (in Hz) for the Mel filterbank.
        preemphasis (`float`, *optional*, defaults to 0.97):
            The preemphasis coefficient.
        preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
            Whether to use HTK-style preemphasis.
        fft_overdrive (`bool`, *optional*, defaults to `True`):
            Whether to use FFT overdrive.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
        input_scale_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor applied to the input waveform.
        mel_floor (`float`, *optional*, defaults to 1e-05):
            Minimum value for Mel spectrograms to avoid log(0).
        per_bin_mean (`Optional[Sequence[float]]`, *optional*):
            Mean values for per-bin normalization.
        per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
            Standard deviation values for per-bin normalization.
    input_featuresinput_features_maskfeature_sizesampling_ratepadding_valuereturn_attention_maskframe_length_mshop_length_msmin_frequencymax_frequencypreemphasispreemphasis_htk_flavorfft_overdriveditherinput_scale_factor	mel_floorper_bin_meanper_bin_stddevc           	      N   t        |   d||||d| || _        || _        |	| _        |
| _        || _        || _        || _        t        t        ||z  dz              | _        t        t        ||z  dz              | _        t        j                  |t        j                        | _        dt#        j$                  t#        j&                  | j                              z  }| j                  r|dz  }|| _        t        j*                  | j                  t        j,                        }ddt        j.                  dt        j0                  z  |z  | j                  z        z
  z  }|j3                  t        j,                        | _        t7        | j(                  dz  dz   |||| j8                  d |      | _        |,t        j                  |      j=                  dd|      | _        nd | _        |,t        j                  |      j=                  dd|      | _         y d | _         y )	N)rM   rN   rO   rP   g     @@r   r   g      ?r   )r   r   r   r   r   r   r    )!super__init__rS   rT   rU   rV   rW   rX   rY   introundframe_length
hop_lengthr   r8   float64rZ   r"   ceillog2r   r    r!   cospiastypewindowr6   rN   mel_filtersreshaper[   r\   )selfrM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   kwargsr   hann_arangerk   	__class__s                        r5   r`   z%Gemma3nAudioFeatureExtractor.__init__   s   ( 	 	
%''"7		

 	
 +*&&<#*"4mo&E&N OPeMM$AF$JKL)2::>$))DIId.?.?$@AA
!OJ$ii 1 1DBFF1ruu9{#:T=N=N#NOOPmmBJJ/+OOq(1,**!
 # " 6 > >q!\ RD $D%"$((>":"B"B1a"VD"&Dr7   waveformattention_maskr   c                 
   |j                   dk(  rt        j                  |d      }| j                  dkD  rO|| j                  t        j                  j
                  |j                   j                  |j                        z  z   }| j                  dk7  r|| j                  z  }| j                  dz   }t        |d|| j                        }| j                  dkD  r| j                  rS|dd	df   d| j                  z
  z  }|dddf   | j                  |dd	d
f   z  z
  }t        j                  ||gd      }n*|ddd	f   | j                  |dd	df   z  z
  }n	|dd	df   }|| j                   z  }t        j"                  j%                  || j&                  d      }t        j(                  |      }	t        j*                  |	| j,                        }
t        j.                  t        j0                  |
| j2                              }| j4                  || j4                  z
  }| j6                  || j6                  z  }|j9                         }|d	d	| j                     j                  t:              }||d	|j                  d    fS ) r   r   )axis        r   r   )r9   r:   r;   .Nr   )nrv   )r?   r   r%   rX   randomrandnr=   rj   r   rY   rc   rH   rd   rU   rV   concatenaterk   fftrfftr   absmatmulrl   logr'   rZ   r[   r\   squeezebool)rn   rr   rs   frame_size_for_unfoldframes_to_processfirst_in_framerest_in_frameframesstftmagnitude_specmel_speclog_mel_specmel_spectrogrammasks                 r5   _extract_spectrogramz1Gemma3nAudioFeatureExtractor._extract_spectrogram   s_   ==A~~hQ7H;;$++		0P0W0WX`XfXf0g"ggH""c)$"9"99H $ 1 1A 5 $HAV]a]l]lmc!**!237!;sTEUEU?U!V 1#qt) <t?O?ORcdgiljlildlRm?m m(GbQ*373d6F6FIZ[^`cac`c[cId6dd&sCRCx0F$++%vv{{6T__2{>99^T-=-=>vvbjj4>>BC('$*;*;;L*'$*=*==L&..000188>%?'<'<Q'? @@@r7   
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc                 V   t        |t        j                        xr t        |j                        dkD  }	t        |t
              xr# t        |d   t        j                  t
        f      }
|	xs |
}|r.|D cg c]"  }t        j                  |g      j                  $ }}n1|s/t        |t        j                        st        j                  |      }|st        j                  |g      g}| j                  t        d|i      |||||      }g }g }t        |j                  |j                        D ]c  \  }}| j                  |j                  |      \  }}|j                  |j                  t        j                                |j                  |       e t        ||d|      S c c}w )a  Creates a batch of MEL spectrograms from the provided raw speech.

        This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
        `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
        carefully when selecting an audio feature extactor, especially with pre-trained models.

        Args:
            raw_speech:
                The audio for which MEL spectrograms are created.
            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
                The padding strategy to use for batches of audio with different lengths.
            max_length (`int`, *optional*, defaults to 480000):
                If provided, defines the maximum length of the audio to allow. Audio longer than this will be
                truncated if `truncation=True`.
            truncation (`bool`, *optional*, defaults to `True`):
                Whether or not to truncate audio above `max_length`.
            pad_to_multiple_of (`int`, *optional*, defaults to 128):
                When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
            return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
                The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow).
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask for the generated MEL spectrograms.
        r   r   rK   )r   r   r   r   rP   )rK   rL   )tensor_type)
isinstancer   ndarraylenr=   r   asarrayTpadr   ziprK   rs   r   appendrj   r!   )rn   r   r   r   r   r   r   rP   ro   is_batched_numpyis_batched_sequence
is_batchedrsbatched_speechprepared_speechprepared_speech_maskspeechr   s                     r5   __call__z%Gemma3nAudioFeatureExtractor.__call__  s   F &j"**=[#jFVFVBWZ[B[(X>t:jYZm^`^h^hjr]sCt%<)<
7AB"**bT*,,BJBJz2::$FJ/J**j\23J*J78!!1"7 " 
 ! = =~?\?\] 	.LFD44VXXtDLFD""6==#<= ''-	.
 .G[\&
 	
/ Cs   6'F&)   i>  rw   Tg      @@g      $@g     @_@g     @g
ףp=
?TTrw   r   gh㈵>NN)longesti S Tr   NT)__name__
__module____qualname____doc__model_input_namesra   floatr   r   r   r`   r   r   tupler   r   liststrr	   r
   r   r   __classcell__)rq   s   @r5   rJ   rJ   n   s   )V *+@A  #"&*!%#$%!'+"$'2648#B'B' B' 	B'
  $B' B' B' B' B' B' !%B' B' B' "B' B'  x/!B'" !%1#B'H+ARZZ +A +AX]^`^h^hjljtjt^tXu +A` 6?$+,/;?04B
"**d5k4

3CT$u+EVVWB
 tS/12B
 SM	B

 B
 %SMB
 !sJ!78B
  (~B
 
B
r7   rJ   )N)r"   collections.abcr   typingr   r   numpyr   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr	   r
   r   
get_loggerr   loggerra   r   r   r   r6   rH   rJ   __all__r^   r7   r5   <module>r      s      $ "  I 4 9 9 
		H	% ::: : 	:
 : : 3-: ZZ:z^2:: ^# ^S ^ ^

 ^&a
#; a
H *
*r7   