
    rhO                         d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZ dd	lmZmZ  e       rddlZ e       rddlZ G d
 ded      Z G d ded      Z G d de      ZdgZy)zProcessor class for Dia    N)Path)OptionalUnion   )
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)is_soundfile_availableis_torch_availablec                   F    e Zd ZU eed<   eed<   eed<   ee   ed<   eed<   y)DiaAudioKwargsbos_token_ideos_token_idpad_token_iddelay_pattern
generationN)__name__
__module____qualname__int__annotations__listbool     y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.pyr   r   "   s$    9r   r   F)totalc                   B    e Zd ZU eed<   dddddddg d	dd
dddidZy)DiaProcessorKwargsaudio_kwargsTrightF)paddingpadding_sideadd_special_tokensi   i  i  )	r      	   
                  iD  )r   r   r   r   r   sampling_ratereturn_tensorspt)text_kwargsr$   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r    r#   r#   *   sB       #"'
 !  >"
 +D1Ir   r#   c                       e Zd ZdZdZdZdZ fdZed        Z		 	 d!de
eee   f   dee   d	ee   d
ee   fdZ	 d"dddee   d
ee   ded   fdZ	 d"dddee   d
ee   ddfdZddd
ee   defdZdede
eeee
eef      f   d
ee   fdZe	 d#dedededee   deded   fd       Zedddededed   ddf
d        Z xZS )$DiaProcessora  
    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
    a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio-
    nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more
    information.

    Args:
        feature_extractor (`DiaFeatureExtractor`):
            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`DiaTokenizer`):
            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input.
    DiaFeatureExtractorDiaTokenizerDacModelc                 *    t         |   |||       y )N)audio_tokenizer)super__init__)selffeature_extractor	tokenizerr=   	__class__s       r    r?   zDiaProcessor.__init__R   s    *IWr   c                 x    | j                   j                  }ddg}t        t        j	                  ||z               S )z
        We no longer pass the raw audio values but the codebooks encoded by the `audio_tokenizer`.
        Conventions may differ between audio models due to architectural choices.
        decoder_input_idsdecoder_attention_mask)rB   model_input_namesr   dictfromkeys)r@   tokenizer_input_namesaudio_tokenizer_input_namess      r    rG   zDiaProcessor.model_input_namesU   s;     !% @ @':<T&U#DMM"7:U"UVWWr   textaudiooutput_labelskwargsc           
      j   t               st        d      |t        d       | j                  t        fi |}|d   }|d   }|d   }|j	                  dd      }	|	dk7  r"t        | j
                  j                   d	      i }
t        |t              r|g}n3t        |t        t        f      rt        d
 |D              st        d       | j                  |fi |}|
j                  |       |j	                  dd      }|j	                  dd      }|j	                  dd      }|j	                  dd      }|j	                  dd      }||||t        d      |r|rt        d| d| d      |
d   j                  d   }t        |      }t!        |      }|Qt#        |      } | j$                  |fi |}t'        j(                  | j*                  j,                  j.                        }|d   d   j                  d   |z  }g }g }t1        |d   |d         D ]  \  }}| j$                  j2                  }t'        j4                  |j7                  d      |z        |z  }||z  }||z
  }t9        j:                         5  |ddd|f   j=                  | j*                  j>                        }| j*                  jA                  |      jB                  jE                  dd      }ddd       |s-t8        jF                  jH                  jK                  dd |!      }t8        jF                  jH                  jK                  dd|dz   dddfd |!      }|dz   |z   }||rdndz  }t9        jL                  dg|z  dg|z  z   t8        jN                  "      dddf   } |jQ                  |       |jQ                  |         t9        jR                  |d      }t9        jR                  |d      }na|rTt9        jT                  |d|f|t8        jN                  "      }t9        jV                  |d|z   ft8        jN                  #      }nt        d$      ||j                  d   k7  rt        d%| d&|j                  d    d'      |j                  d   }!|!|z
  }"| jY                  ||!||d()      }#t9        jT                  ||!|f|t8        jZ                  *      }$||$ddd|"f<   | j]                  |$|||#+      }%|
j                  |%|d,       |r|
d-   j_                         ddddf   }&d.|&|&|k(  <   d.|&|&|k(  <   |&jE                  dd      ja                  ||z  d      jc                         jO                         |
d/<   |
d-   ddddf   |
d-<   |
d0   ddddf   |
d0<   te        |
|	1      S # 1 sw Y   xY w)2a  
        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
        DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer
        to the docstring of the above methods for more information.
        zThe `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't find it in your environment. You can install torch via `pip install torch`.Nz0You need to specify the `text` input to process.r4   r$   r5   r2   r3   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wN)
isinstancestr).0ts     r    	<genexpr>z(DiaProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[s   zAInvalid input text. Please provide a string, or a list of stringsr   r   r   r   r   TzTo enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.z9Labels with `generation` is incompatible, got generation=z, output_labels=.	input_idsr   padding_maskinput_valuesdim.      )r   r   r   r_   r   r   constant)padmodevaluedtype)sizerf   z;If you try to train, you should provide audio data as well.zNNeed the same amount of samples for both text and audio, but got text samples=z and audio samples = z	 instead.Fbszseq_lennum_channelsr   revert)
fill_valuerf   rM   r   r   precomputed_idx)rE   rF   rE   ilabelsrF   )datatensor_type)3r   
ValueError_merge_kwargsr#   poprC   r   rS   rT   r   tupleallrB   updateshapelenmaxr   rA   mathprodr=   configdownsampling_ratioszip
hop_lengthceilsumtorchno_gradtodeviceencodeaudio_codes	transposenn
functionalrb   tensorlongappendcatfullonesbuild_indicesr   apply_audio_delayclonereshape
contiguousr	   )'r@   rL   rM   rN   rO   output_kwargsr4   r$   r5   r2   rq   	encodingsr   audio_bos_token_idaudio_eos_token_idaudio_pad_token_idr   
batch_sizerk   	max_delayinput_audioscompression_ratemax_encoded_sequence_lenrE   rF   rZ   base_pad_lencurrent_audio_lenencoded_sequence_lenpadding_lenrY   num_valid_inputsattention_maskmax_seq_lenmax_audio_lenro   prefilldelayed_decoder_input_idsrp   s'                                          r    __call__zDiaProcessor.__call___   sm    "#^ 
 <OPP***


 $M2$^4%o6&**+;TBT! 7 788]^__ dC 6DTD%=1c9[VZ9[6[`aa"DNN47;7	I %(($?)--ndC)--ndC)--ndC!%%lD9
&!)!)$k 
 -KJ<Wghugvvwx  +&,,Q/
=)&	 &u-E1411%H<HL#yy)=)=)D)D)X)XY'3N'CA'F'L'LR'PTd'd$ "%'" (+<+GVdIe'f >#e#55@@$(IIl.>.>2.>.F.U$VYe$e!'8<L'L$69MM ]]_ _!$-?.?-?"?@CCDDXDXD_D_`E $ 4 4 ; ;E B N N X XYZ\] ^I_ " % 3 3 7 7!'9
Rd !8 !I "HH//33Aq+/1a#C*\n 4 	 $8!#;i#G  A: !&qcK.?1#HXBX.X`e`j`j!klprsls!t!((3&--n=9>< !&		*; C%*YY/E1%M" %

J<+HJ\didndn o &+ZZj!i-5PX]XbXb%c"Z[[*0033`ak`l m##4#:#:1#=">iI  -2226#i/,,%' - 
 **l3)))

 &7>M>!"$($:$:+++	 %; %
! 	*C_uvw-.446q!"u=F37F6//037F6//0#--a3;;J<UWYZeegllnDN(,-@(A!SbS&(ID$%-12J-KAsPRsF-SD)*>BB]_ _s   A#V((V2	rE   torch.Tensoraudio_prompt_lenreturnc                 Z    | j                   t        fi |}|d   }|j                  dd      }|j                  dd      }|j                  dd      }|||t        d      |Rt	        j
                  ||j                  t        j                        }|d   j                  |j                  d         }	n|dddddf   |k(  j                  d	
      }	|j                  d   |dddddf   |k(  j                  d	
      z
  dz
  }
|j                  \  }}}| j                  ||||d      }| j                  |d	d	|      j                  dd      }g }t	        j                         5  t        |	j                  d         D ]  }||dd|	|   |
|   f   d   }|j!                  | j"                  j                        }| j"                  j%                  |      j&                  j)                         j+                         }|j-                  |        	 ddd       |S # 1 sw Y   |S xY w)a  
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
        r$   r   Nr   r   zTo enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.)r   rf   r   r[   r]   r_   Trh   rn   r`   )N.)r   )rt   r#   ru   rs   r   r   r   r   expandry   r   r   r   r   r   ranger   r=   decodeaudio_valuescpusqueezer   )r@   rE   r   rO   r   r$   r   r   r   start_of_generation_idxend_of_generation_idxri   rj   rk   ro   output_sequencesaudiosioutput_iaudio_is                       r    batch_decodezDiaProcessor.batch_decode  sx    +**

 %^4$(($?)--ndC)--ndC%);)C}G\[  '$||,<EVE]E]ejeoeop&6t&<&C&CDUD[D[\]D^&_#'8Aq'AEW'W&\&\ac&\&d# ##A&*;Aq!G*DHZ*Z)_)_df)_)ggjkk 	
 &7%<%<"Wl,,%' - 
  11# + 2 
 )Aq/ 	 ]]_ 	'288;< '+Aq2I!2LOdefOg2g,ghirs#;;t';';'B'BC..55(5KXX\\^ffhg&	'	' 	' s   *B+H  H*c                     |j                   d   dk7  rt        d|j                   d    d       | j                  ||fi |d   S )z
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        r   r_   z5Expecting a single output to be decoded but received z samples instead.)ry   rs   r   )r@   rE   r   rO   s       r    r   zDiaProcessor.decodeS  sc     ""1%*GHYH_H_`aHbGcctu  !t  !24DOOPQRRr   rF   c                      | j                   t        fi |}|d   }|j                  dd      }|t        d      |j                  d   t        |      z
  S )z0Utility function to get the audio prompt length.r$   r   NzTo enable the utility of retrieving the prompt length for Dia, we need the `delay_pattern`. You may have accidentally overwritten this.r_   )rt   r#   ru   rs   ry   r{   )r@   rF   rO   r   r$   r   s         r    get_audio_prompt_lenz!DiaProcessor.get_audio_prompt_lend  su     +**

 %^4$(($? O  &++A.]1CCCr   saving_pathc                 F   t               st        d      t        |      }t        |t        t
        f      r|g}n3t        |t        t        f      rt        d |D              st        d      t        |      t        |      k7  rt        d       | j                  t        fi |}|d   }|d   }t        ||      D ]b  \  }}t        |t        j                        r,|j!                         j#                         j%                         }t'        j(                  |||       d y )Nz/Please install `soundfile` to save audio files.c              3   H   K   | ]  }t        |t        t        f        y wrR   )rS   rT   r   )rU   ps     r    rW   z*DiaProcessor.save_audio.<locals>.<genexpr>  s     @q`aAPSUY{A[@qs    "zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer$   r1   )r   ImportErrorr   rS   rT   r   r   rv   rw   rs   rz   rt   r#   r   r   Tensorr   floatnumpysfwrite)	r@   rM   r   rO   r   r$   r1   audio_valuer   s	            r    
save_audiozDiaProcessor.save_audioy  s    &'OPP #5) kC;/&-K[4-8S@qep@q=q`aau:[))TUU***

 %^4$_5!%5 	4NK+u||4)oo/557==?HHQ]3	4r   ri   rj   rk   r   rl   )r   r   c                    t        j                  |t         j                        }t        j                  |t         j                        dddf   j	                  | |      d   }|s||ddddf   z
  }n||ddddf   z   }t        j
                  |d|dz
        }t        j                  | t         j                        ddddf   j	                  | ||      }t        j                  |t         j                        ddddf   j	                  | ||      }	t        j                  |j                  d      |j                  d      |	j                  d      gd      j                         }
||
fS )a  
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        re   N).Nr   r_   r[   r]   )	r   r   int32aranger   clampstackr   r   )ri   rj   rk   r   rl   delay_arraysequence_idxvalid_sequence_idx	batch_idxchannel_idxall_idxs              r    r   zDiaProcessor.build_indices  sI    ll=D ||G5;;?aHOOPSU\]^gh'+dD!m*DDL'+dD!m*DDL"[[q'A+FLLEKK8D$GNNsT[]ij	ll<u{{CD$PQMRYYZ]_fhtu++r"$6$>$>r$BKDWDWXZD[\
 $& 	
 W$$r   r   r   ro   c           	      r   | j                   }|\  }}|j                  |      }|j                  |      }t        j                  |d      \  }}}	| |||	f   j	                  | j                               }
|dk  }|| j                  d   k\  }t        j                  ||t        j                  |||
            }|S )a  
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
        r[   r]   r   r_   )r   r   r   unbindviewrg   ry   where)rM   r   r   ro   r   r   r   r   r   r   gathered_audiomask_bosmask_padfinal_audios                 r    r   zDiaProcessor.apply_audio_delay  s    *  /g#v.**V$ 6;\\'r5R2	%{y*<kIJOOPUPZPZP\]  !#5;;q>1kk(L%++hP\^l:mnr   )NFrR   )F)r   r   r   __doc__feature_extractor_classtokenizer_classaudio_tokenizer_classr?   propertyrG   r   rT   r   r   r   r   r   r#   r   r   r   r   r   r   r   staticmethodrv   r   r   __classcell__)rC   s   @r    r8   r8   >   s&    4$O&X X X '+(-	kCCcN#kC 
#kC  ~	kC
 +,kC` +/E)E #3-E +,	E
 
n	ET +/S)S #3-S +,	S
 
S"D .D +,D 
	D* 4 4 3d5d+;&<<= 4 +,	 4D   % % %  % Cy	 %
  % 
-	. %  %D """ " =>	"
 
" "r   r8   )r   r|   pathlibr   typingr   r   audio_utilsr   r   feature_extraction_utilsr	   processing_utilsr
   r   r   r   utilsr   r   r   	soundfiler   r   r#   r8   __all__r   r   r    <module>r      sl       " 9 4 U U ? [ ) (c> cL 
r   