
import math
from pathlib import Path
from typing import Any, Optional, Union

import numpy as np

from ...utils import is_soundfile_available, is_torch_available


if is_torch_available():
    import torch

if is_soundfile_available():
    import soundfile as sf

from ...audio_utils import AudioInput, make_list_of_audio
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class CsmAudioKwargs(AudioKwargs, total=False):
    encoded_length_kwargs: Optional[dict[str, Any]]


class CsmProcessorKwargs(ProcessingKwargs, total=False):
    audio_kwargs: CsmAudioKwargs

    _defaults = {
        "text_kwargs": {
            "padding": True,
            "padding_side": "left",
            "add_special_tokens": False,
        },
        "audio_kwargs": {
            "encoded_length_kwargs": {
                "kernel_sizes": [7, 3, 1, 8, 3, 1, 10, 3, 1, 12, 3, 1, 16, 3, 4],
                "strides": [1, 1, 1, 4, 1, 1, 5, 1, 1, 6, 1, 1, 8, 1, 2],
                "dilations": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                "use_causal_conv": True,
            },
            "sampling_rate": 24000,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class CsmProcessor(ProcessorMixin):
    r"""
    Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import CsmProcessor
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

        processor = CsmProcessor.from_pretrained("sesame/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
            audio=audio,
            text_kwargs={"padding": False},
            audio_kwargs={"sampling_rate": 16000},
            common_kwargs={"return_tensors": "pt"},
        )
        # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
        ```
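
    For a call that is expected to succeed, keep (or resample) the audio at the feature
    extractor's default 24 kHz sampling rate, e.g.:
        ```python
        # assumes `audio` is a waveform already sampled at 24 kHz
        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|>"],
            audio=audio,
        )
        ```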

    Args:
        feature_extractor ([`EncodecFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "EncodecFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(
        self,
        feature_extractor,
        tokenizer,
        chat_template=None,
    ):
        if not hasattr(tokenizer, "audio_token"):
            self.audio_token = "<|AUDIO|>"
            self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        else:
            self.audio_token = tokenizer.audio_token
            self.audio_token_id = tokenizer.audio_token_id

        if not hasattr(tokenizer, "audio_eos_token"):
            self.audio_eos_token = "<|audio_eos|>"
            self.audio_eos_token_id = tokenizer.convert_tokens_to_ids(self.audio_eos_token)
        else:
            self.audio_eos_token = tokenizer.audio_eos_token
            self.audio_eos_token_id = tokenizer.audio_eos_token_id

        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

    @staticmethod
    def _get_encoded_length(audio_length, kernel_sizes=None, strides=None, dilations=None, use_causal_conv=None):
        """
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            dilations (list[int]): The dilations for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        """
        cur_length = audio_length

        if kernel_sizes is None or strides is None or dilations is None or use_causal_conv is None:
            return cur_length

        for kernel_size, stride, dilation in zip(kernel_sizes, strides, dilations):
            effective_kernel_size = (kernel_size - 1) * dilation + 1
            padding_total = kernel_size - stride
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right

            # extra padding needed so that the last window is complete
            n_frames = (cur_length - effective_kernel_size + padding_total) / stride + 1
            n_frames = math.ceil(n_frames) - 1
            ideal_length = n_frames * stride + kernel_size - padding_total
            extra_padding = ideal_length - cur_length

            if use_causal_conv:
                padding_left = padding_total
                padding_right = extra_padding
            else:
                padding_right = padding_right + extra_padding

            # standard conv1d output-length formula, applied after padding
            cur_length = cur_length + padding_left + padding_right
            cur_length = (cur_length - dilation * (kernel_size - 1) - 1) // stride + 1

        return cur_length

    def save_audio(
        self,
        audio: AudioInput,
        saving_path: Union[str, Path, list[Union[str, Path]]],
        **kwargs: Unpack[CsmProcessorKwargs],
    ):
        if not is_soundfile_available():
            raise ImportError("Please install `soundfile` to save audio files.")

        # ensure correct audio input
        audio = make_list_of_audio(audio)

        # ensure correct saving path(s)
        if isinstance(saving_path, (str, Path)):
            saving_path = [saving_path]
        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
            raise ValueError("Invalid input path. Please provide a string, or a list of strings")

        if len(audio) != len(saving_path):
            raise ValueError("The number of audio and saving paths must be the same")

        output_kwargs = self._merge_kwargs(CsmProcessorKwargs, **kwargs)
        audio_kwargs = output_kwargs["audio_kwargs"]
        sampling_rate = audio_kwargs["sampling_rate"]

        for audio_value, p in zip(audio, saving_path):
            if isinstance(audio_value, torch.Tensor):
                audio_value = audio_value.cpu().float().numpy()
            sf.write(p, audio_value, sampling_rate)

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]],
        audio: Optional[AudioInput] = None,
        output_labels: Optional[bool] = False,
        depth_decoder_labels_ratio: Optional[float] = 1.0,
        **kwargs: Unpack[CsmProcessorKwargs],
    ) -> BatchFeature:
        r"""
        Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
        the text. To prepare the audio, this method forwards the `audio` arguments to
        EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
        to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            output_labels (bool, *optional*, defaults to `False`):
                Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
                - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
                - `-100` will be ignored in the loss computation
                - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
            depth_decoder_labels_ratio (float, *optional*, defaults to `1.0`):
                The ratio of audio frames to keep for the depth decoder labels.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **input_values_cutoffs** -- Cumulative cut indices delimiting each audio segment within
              `input_values`. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        """
        output_kwargs = self._merge_kwargs(
            CsmProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        n_audio_in_text = [t.count(self.audio_token) for t in text]
        n_audio = 0
        if audio is not None:
            audio = make_list_of_audio(audio)
            n_audio = len(audio)

        if sum(n_audio_in_text) > 0 and n_audio != sum(n_audio_in_text):
            if audio is None:
                raise ValueError("No audio were provided, but there are audio tokens in the prompt")
            else:
                raise ValueError(
                    f"The number of audio tokens in each text ({n_audio_in_text}) should be the same as the "
                    f"number of provided audios ({n_audio})."
                )

        if audio is not None:
            # expand each audio token in the text into as many audio tokens as the
            # corresponding audio will yield encoded frames
            encoded_length_kwargs = audio_kwargs.pop("encoded_length_kwargs", {})
            num_audio_tokens_list = [
                self._get_encoded_length(audio_array.shape[-1], **encoded_length_kwargs) for audio_array in audio
            ]
            num_audio_tokens_list_copy = num_audio_tokens_list.copy()

            expanded_text = []
            for sample in text:
                replace_str = []
                while self.audio_token in sample:
                    num_audio_tokens = num_audio_tokens_list_copy.pop(0)
                    expanded_audio_token = self.audio_token * num_audio_tokens
                    replace_str.append(expanded_audio_token)
                    sample = sample.replace(self.audio_token, "<placeholder>", 1)

                while "<placeholder>" in sample:
                    sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
                expanded_text.append(sample)
            text = expanded_text

        encoding = self.tokenizer(text, **text_kwargs)
        data = {}
        data.update(encoding)

        if audio is not None:
            audio_kwargs.pop("return_attention_mask", None)

            # concatenate the audio of each batch element, keeping track of the cut
            # indices so the segments can be split back after feature extraction
            concatenated_audio, input_values_cutoffs = [], []
            offset = 0
            for n_audio in n_audio_in_text:
                if n_audio == 0:
                    concatenated_audio.append(np.zeros(0))
                    input_values_cutoffs.append(torch.tensor([-1]))
                else:
                    batch_audio = audio[offset : offset + n_audio]
                    concatenated_audio.append(
                        np.concatenate(
                            [el.cpu().numpy() if isinstance(el, torch.Tensor) else el for el in batch_audio],
                            axis=-1,
                        )
                    )
                    input_values_cutoffs.append(torch.tensor([el.shape[-1] for el in batch_audio]).cumsum(dim=-1))
                    offset += n_audio

            # pad the cut indices to the maximum number of audio segments per batch element
            max_len = max(cut_idxs.shape[-1] for cut_idxs in input_values_cutoffs)
            input_values_cutoffs = [
                torch.nn.functional.pad(cut_idxs, (0, max_len - cut_idxs.shape[-1]), value=-1)
                for cut_idxs in input_values_cutoffs
            ]

            audio_inputs = self.feature_extractor(concatenated_audio, **audio_kwargs)
            audio_inputs.pop("padding_mask", None)
            data.update(audio_inputs)
            data["input_values_cutoffs"] = torch.stack(input_values_cutoffs, dim=0)

        if output_labels:
            audio_frame_idxs = (data["input_ids"] == self.audio_token_id).nonzero()
            n_audio_frames = audio_frame_idxs.shape[0]

            if depth_decoder_labels_ratio < 1.0:
                rand_idxs = torch.randperm(n_audio_frames)[: int(n_audio_frames * (1 - depth_decoder_labels_ratio))]
                skip_frames_idxs = audio_frame_idxs[rand_idxs]
            else:
                skip_frames_idxs = audio_frame_idxs

            labels = torch.where(
                (data["input_ids"] == self.audio_token_id) | (data["input_ids"] == self.audio_eos_token_id),
                data["input_ids"],
                -100,
            )
            labels[skip_frames_idxs[:, 0], skip_frames_idxs[:, 1]] = -101
            data["labels"] = labels

        return BatchFeature(data=data, tensor_type=return_tensors)


__all__ = ["CsmProcessor"]
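

if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: estimate how many
    # `<|AUDIO|>` placeholder tokens one second of 24 kHz audio expands to by
    # pushing the default encoder geometry through `_get_encoded_length`.
    default_geometry = CsmProcessorKwargs._defaults["audio_kwargs"]["encoded_length_kwargs"]
    num_tokens = CsmProcessor._get_encoded_length(24000, **default_geometry)
    # the strides multiply to 1920, so 24000 samples land near 24000 / 1920 = 12.5 frames
    print(f"1s of 24 kHz audio -> {num_tokens} audio tokens")
    # saving model outputs back to disk would look like (requires soundfile, and a
    # hypothetical `generated_waveform` tensor):
    # processor.save_audio(generated_waveform, "output.wav")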