
    rh                        d Z ddlZddlmZ ddlmZmZmZ ddlZ	ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z   ejB                  e"      Z#e ed       G d de                    Z$e ed       G d de                    Z%e
jL                  jN                  d        Z(	 	 	 	 	 d@dZ)d Z* G d de
j                  jV                        Z, G d dejV                        Z- G d d ejV                        Z. G d! d"ejV                        Z/ G d# d$ejV                        Z0 G d% d&ejV                        Z1 G d' d(ejV                        Z2 G d) d*ejV                        Z3 G d+ d,ejV                        Z4 G d- d.ejV                        Z5 G d/ d0ejV                        Z6 G d1 d2ejV                        Z7 G d3 d4ejV                        Z8 G d5 d6e      Z9 G d7 d8ejV                        Z: G d9 d:ejV                        Z;e G d; d<e             Z< ed=       G d> d?e<             Z=d?d<gZ>y)AzPyTorch VITS model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)VitsModelOutputa"  
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The length in samples of each element in the `waveform` batch.
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
        GAN decoder model to obtain the final audio waveform.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler   r        y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/vits/modeling_vits.pyr   r   (   s     -1Hhu(()048hu00186:K% 1 123:8<M8E%"3"345<59Ju00129r&   r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)VitsTextEncoderOutputa  
    prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted mean values of the prior distribution for the latent text variables.
    prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted log-variance values of the prior distribution for the latent text variables.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r   r    r*   r   r!   r"   r#   r+   r,   r   r$   r   r%   r&   r'   r)   r)   @   s~     6:x 1 129/3K%++,37;%"3"34;8<M8E%"3"345<59Ju00129r&   r)   c                     | |z   }t        j                  |d d d |d d f         }t        j                  |d d |d d d f         }||z  }|S N)r!   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r'   fused_add_tanh_sigmoid_multiplyr8   U   sT    wFJJva,123EMM&LM1!456E5=DKr&   c	                    | | k\  | |k  z  }	|	 }
t        j                  |       }t        j                  |       }t        j                  t        j                  d|z
        dz
        }t
        j                  j                  |d      }||d<   ||d<   | |
   ||
<   d||
<   t        | |	   ||	ddf   ||	ddf   ||	ddf   |||||	      \  ||	<   ||	<   ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r!   
zeros_likenplogexpr   
functionalr:   _rational_quadratic_spline)r?   r@   rA   rB   rC   rD   rE   rF   rG   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r'   (_unconstrained_rational_quadratic_splinerS   ^   s   \ #zk1f
6JK11v&G""6*KvvbffQ/0145H!}}001Iv0V'/V$(0W%%+,A%BG!"),K%&Ga*+/0Da0GH12F2IJ!9:NPQ:Q!R#%%
HDG !;/C#D Kr&   c	                    |}	| }
t        j                  |       |
k  st        j                  |       |	kD  rt        d      |j                  d   }||z  dkD  rt        d| d|       ||z  dkD  rt        d| d|       t
        j                  j                  |d      }|d||z  z
  |z  z   }t        j                  |d      }t
        j                  j                  |d	d
d      }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf   |dddf   z
  }|t
        j                  j                  |      z   }t
        j                  j                  |d      }|d||z  z
  |z  z   }t        j                  |d      }t
        j                  j                  |d	d
d      }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf   |dddf   z
  }|r|n|}|dxx   dz  cc<   t        j                  | d   |k\  d      dz
  }|d   }|j                  d|      d   }|j                  d|      d   }|j                  d|      d   }||z  }|j                  d|      d   }|j                  d|      d   }|dddf   j                  d|      d   }|j                  d|      d   }||z   d|z  z
  }|s| |z
  |z  }|d|z
  z  }|||j                  d      z  ||z  z   z  }|||z  z   }|||z  z   } |j                  d      ||j                  d      z  d|z  |z  z   |d|z
  j                  d      z  z   z  }!t        j                  |!      dt        j                  |      z  z
  }"| |"fS | |z
  }#|#|z  }$|||z
  z  |$z   }%||z  |$z
  }&| |#z  }'|&j                  d      d|%z  |'z  z
  }(|(dk\  j                         st!        d|(       d|'z  |& t        j"                  |(      z
  z  })|)|z  |z   } |)d|)z
  z  }|||z  z   }|j                  d      ||)j                  d      z  d|z  |z  z   |d|)z
  j                  d      z  z   z  }!t        j                  |!      dt        j                  |      z  z
  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr=         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rR   r>   )r:   modevaluer;   r<   .Ngư>).N      r   zinvalid discriminant )r!   minmax
ValueErrorshaper   rL   softmaxcumsumr:   softplussumgatherpowrJ   allRuntimeErrorsqrt)*r?   r@   rA   rB   rC   rD   rE   rF   rG   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrP   derivative_numeratorrQ   intermediate2intermediate3abcdiscriminantroots*                                             r'   rM   rM      sR   X K+Kyy;&%))F*;k*IHII"((,Hx#%-m_<^_g^hijj 3&.~.>>`ai`jkll]]""#6B"?Fa-("::fDDFV,I!!)jPS!TI{*i7+EI#If$IgsABw)C"H"55F 2==#9#9:R#SSKmm##$8b#AGNX$= =HHGg2.J"":6
RU"VJ+z9KGJ$Jv%Jwab!JsCRCx$88G")JyM'd"iiy)]:CaGGi G&&r73F;O}}R1&9!((W5f=fE,,r7+F3K#**2w7?!,S!"W!5!<!<R!I&!QNN2w/7M%(BBQ_TM/)-== %U 3![599Q<%?BSVkBk%kl	!M4I$II"Y%<<*q1&15+o 5561u9//!"445 

 ii 45EIIk<R8RR## !11%5[+<<=M--=L=(uuQx!a%!)+!&&(!6|nEFFA1"uzz,778))O; $D 1!M4I$II*q1&!4+o 5561t8.."334 

 ii 45EIIk<R8RR$$r&   c                   6     e Zd Zdedef fdZddZd Z xZS )VitsWaveNetconfig
num_layersc                    t         |           |j                  | _        || _        t        j
                  j                         | _        t        j
                  j                         | _        t        j                  |j                        | _        t        t
        j                  j                  d      r%t
        j                  j                  j                  }nt
        j                  j                  }|j                   dk7  rJt        j
                  j#                  |j                   d|j                  z  |z  d      } ||d      | _        t'        |      D ]  }|j(                  |z  }|j*                  |z  |z
  dz  }t        j
                  j#                  |j                  d|j                  z  |j*                  ||      } ||d      }| j                  j-                  |       ||dz
  k  rd|j                  z  }	n|j                  }	t        j
                  j#                  |j                  |	d      }
 ||
d      }
| j                  j-                  |
        y )Nweight_normr   rZ   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r!   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r'   r   zVitsWaveNet.__init__1  s   !--$,,.$xx224zz&"8"89288,,m<((33??K((..K((A-)F)FFL^L^H^akHkmnoJ)*8DDOz" 	8A33Q6H11H<xGAMGxx"..!3!33"66! ' H #8(;HNN!!(+ :>!$%(:(:$:!$*$6$6!"XX__V-?-?ARTUVN(hGN  ''7+	8r&   c                    t        j                  |      }t        j                  | j                  g      }|| j	                  |      }t        | j                        D ]  } | j                  |   |      }|1|dz  | j                  z  }|d d ||d| j                  z  z   d d f   }	nt        j                  |      }	t        ||	|d         }
| j                  |
      }
 | j                  |   |
      }|| j                  dz
  k  r<|d d d | j                  d d f   }||z   |z  }||d d | j                  d d d f   z   }||z   } ||z  S )NrZ   r   r   )r!   rH   	IntTensorr   r   r   r   r   r8   r   r   )r   r?   padding_maskglobal_conditioningrP   num_channels_tensorr   r   cond_offsetglobal_statesr7   res_skip_actsres_actss                r'   forwardzVitsWaveNet.forwardZ  sm   ""6*#oot/?/?.@A*"&//2E"Ft' 	2A-DNN1-f5M".!ed&6&66 3A{[STW[WgWgSgEg7gij4j k % 0 0 ?2=-QdefQghD<<%D3D003D9M4??Q&&(,>d.>.>,>)AB 8+|;!M!T5E5E5G2J$KK!M1%	2( %%r&   c                 p   | j                   dk7  r3t        j                  j                  j	                  | j
                         | j                  D ]+  }t        j                  j                  j	                  |       - | j                  D ]+  }t        j                  j                  j	                  |       - y )Nr   )r   r!   r   r   remove_weight_normr   r   r   r   layers     r'   r   zVitsWaveNet.remove_weight_normw  s~    &&!+HHNN--doo>^^ 	5EHHNN--e4	5)) 	5EHHNN--e4	5r&   r.   )	r   r   r   r   intr   r   r   __classcell__r   s   @r'   r   r   0  s!    '8z '8s '8R&:5r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsPosteriorEncoderr   c                 B   t         |           |j                  | _        t	        j
                  |j                  |j                  d      | _        t        ||j                        | _        t	        j
                  |j                  | j                  dz  d      | _        y )Nr   r   rZ   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r'   r   zVitsPosteriorEncoder.__init__  su    ",,		&"9"96;M;MqQ"6f6a6ab6#5#5t7H7H17LaPr&   c                 .   | j                  |      |z  }| j                  |||      }| j                  |      |z  }t        j                  || j
                  d      \  }}|t        j                  |      t        j                  |      z  z   |z  }|||fS )Nr   rV   )r   r   r   r!   splitr   
randn_likerK   )r   r?   r   r   statsmean
log_stddevsampleds           r'   r   zVitsPosteriorEncoder.forward  s    v&5fl4GHv&5 ;;ud.?.?QGj%**40599Z3HHHLXj((r&   r.   r   r   r   r   r   r   r   r   s   @r'   r   r     s    Qz Q)r&   r   c                   :     e Zd Zd fd	ZddZd Zd Zd Z xZS )HifiGanResidualBlockc                    t         |           || _        t        j                  t        t        |            D cg c]3  }t        j                  |||d||   | j                  |||               5 c}      | _	        t        j                  t        t        |            D cg c]-  }t        j                  |||dd| j                  |d            / c}      | _
        y c c}w c c}w )Nr   )strider   r   )r   r   leaky_relu_sloper   r   r   lenr   get_paddingconvs1convs2)r   channelsr   r   r   r   _r   s          r'   r   zHifiGanResidualBlock.__init__  s     0mm s8}-
  		%a[ ,,[(1+F

 mm s8}-
  		 ,,[!<



s   8C$%2C)c                     ||z  |z
  dz  S )NrZ   r%   )r   r   r   s      r'   r   z HifiGanResidualBlock.get_padding  s    h&1a77r&   c                 ,   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]
  } ||        y Nr   )r   r   r   r   r   r   r   r   r   r   s      r'   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  sp    hh**288,,m<((33??K[[ 	E	[[ 	E	r&   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]!  }t        j                  j                  |       # y r.   )r   r   r   r   r   r   s     r'   r   z'HifiGanResidualBlock.remove_weight_norm  sL    [[ 	/EHH''.	/[[ 	/EHH''.	/r&   c                 ,   t        | j                  | j                        D ]p  \  }}|}t        j                  j                  || j                        } ||      }t        j                  j                  || j                        } ||      }||z   }r |S r.   )zipr   r   r   rL   
leaky_relur   )r   r   conv1conv2residuals        r'   r   zHifiGanResidualBlock.forward  s    T[[9 	5LE5$HMM44]DDYDYZM!-0MMM44]DDYDYZM!-0M)H4M	5 r&   )r   )r   r      g?r   )	r   r   r   r   r   r   r   r   r   r   s   @r'   r   r     s    
>8/r&   r   c                        e Zd Zdef fdZd Zd Z	 d	dej                  de	ej                     dej                  fdZ
 xZS )
VitsHifiGanr   c                 d   t         |           || _        t        |j                        | _        t        |j                        | _        t        j                  |j                  |j                  ddd      | _        t        j                         | _        t        t!        |j                  |j"                              D ]d  \  }\  }}| j                  j%                  t        j&                  |j                  d|z  z  |j                  d|dz   z  z  ||||z
  dz               f t        j                         | _        t+        t        | j                              D ]p  }|j                  d|dz   z  z  }t!        |j                  |j,                        D ]6  \  }}| j(                  j%                  t/        ||||j0                               8 r t        j                  ddddd      | _        |j4                  dk7  r1t        j                  |j4                  |j                  d      | _        y y )	N   r   r   )r   r   r   rZ   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r'   r   zVitsHifiGan.__init__  s   v;;< !6!67		++
 /8V=R=RTZTpTp9q/r 		+A+{NN!!""331=33a!eE +((=8Q>		 s4>>*+ 	vA661Q<HH),V-I-I6KiKi)j v%X%%&:8[RZ\b\s\s&tuv	v
 8QAaQRY^_((A-		&"?"?A`A`bcdDI .r&   c                 <   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]  }|j                           y r   )r   r   r   r   r   r   r   r   r   s      r'   r   zVitsHifiGan.apply_weight_norm  st    hh**288,,m<((33??K^^ 	E	^^ 	&E##%	&r&   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]  }|j                           y r.   )r   r   r   r   r   r   s     r'   r   zVitsHifiGan.remove_weight_norm   sF    ^^ 	/EHH''.	/^^ 	'E$$&	'r&   r   r   returnc                    | j                  |      }||| j                  |      z   }t        | j                        D ]  }t        j
                  j                  || j                  j                        } | j                  |   |      } | j                  || j                  z     |      }t        d| j                        D ]*  }| | j                  || j                  z  |z      |      z  }, || j                  z  } t        j
                  j                  |      }| j                  |      }t        j                  |      }|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        r   )r   r   r   r   r   rL   r   r   r   r   r   r   r   r!   r/   )r   r   r   r   r   	res_statejr   s           r'   r   zVitsHifiGan.forward  s+    k2*)DII6I,JJMt))* 	9AMM44]DKKD`D`aM-DNN1-m<M<q4+;+;';<]KI1d../ UET^^A0@0@,@1,DEmTT	U%(8(88M	9 00?}5::m,r&   r.   )r   r   r   r   r   r   r   r!   r"   r   r   r   r   s   @r'   r   r     sW    "ez "eH&' bf  ,, CKEL]L]C^ 			 r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingLayerr   c                 B   t         |           |j                  dz  | _        t	        j
                  | j                  |j                  d      | _        t        ||j                        | _
        t	        j
                  |j                  | j                  d      | _        y )NrZ   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r   r   s     r'   r   z"VitsResidualCouplingLayer.__init__*  su    #--2		$"4"4f6H6H!L"6f6]6]^6#5#5t7I7I1Mr&   c                    t        j                  || j                  gdz  d      \  }}| j                  |      |z  }| j	                  |||      }| j                  |      |z  }t        j                  |      }	|sS||t        j                  |	      z  |z  z   }t        j                  ||gd      }
t        j                  |	ddg      }|
|fS ||z
  t        j                  |	       z  |z  }t        j                  ||gd      }
|
d fS )NrZ   r   rV   )
r!   r   r  r   r   r   rH   rK   catrc   )r   r?   r   r   rC   
first_halfsecond_halfr   r   r   rP   log_determinants               r'   r   z!VitsResidualCouplingLayer.forward2  s   "'++ft7I7I6JQ6NTU"V
Kj1L@]LBUV~~m,|;%%d+
uyy/D!D|!SSKii[ 9qAG#ii
QF;OO++&-J;1GG,VKii[ 9qAGD= r&   NFr   r   s   @r'   r  r  )  s    Nz N!r&   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingBlockr   c                     t         |           t        j                         | _        t        |j                        D ]&  }| j                  j                  t        |             ( y r.   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  r   r   r   r   s      r'   r   z"VitsResidualCouplingBlock.__init__E  sO    ]]_
v556 	AAJJ7?@	Ar&   c                     |s7| j                   D ]&  } ||||      \  }}t        j                  |dg      }( |S t        | j                         D ](  }t        j                  |dg      } ||||d      \  }}* |S )Nr   TrC   )r  r!   flipreversed)r   r?   r   r   rC   flowr   s          r'   r   z!VitsResidualCouplingBlock.forwardK  s    

 1 7JK	FQC01  !, ZFQC0 7JTXY	Z r&   r  r   r   s   @r'   r  r  D  s    Az A	r&   r  c                   .     e Zd Zddef fdZddZ xZS )VitsDilatedDepthSeparableConvr   c                 D   t         |           |j                  }|j                  }|j                  | _        t        j                  |      | _        t        j                         | _
        t        j                         | _        t        j                         | _        t        j                         | _        t        | j
                        D ]  }||z  }||z  |z
  dz  }| j                  j                  t        j                   ||||||             | j                  j                  t        j                   ||d             | j                  j                  t        j"                  |             | j                  j                  t        j"                  |              y )NrZ   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r'   r   z&VitsDilatedDepthSeparableConv.__init__X  s7   ;;%% ;;zz,/]]_!}}}}}}t' 	8A"A~H"X-8Q>G%%		 (!) +#%#	   ''		(Ha(HILLX 67LLX 67	8r&   c                 $   |||z   }t        | j                        D ]  } | j                  |   ||z        } | j                  |   |j	                  dd            j	                  dd      }t
        j                  j                  |      } | j                  |   |      } | j                  |   |j	                  dd            j	                  dd      }t
        j                  j                  |      }| j                  |      }||z   } ||z  S Nr   r=   )r   r   r   r"  	transposer   rL   gelur!  r#  r   )r   r?   r   r   r   r   s         r'   r   z%VitsDilatedDepthSeparableConv.forwardt  s   *11Ft' 	,A1D..q1&<2GHM+DLLOM,C,CAr,JKUUVWY[\MMM..}=M3D003MBM+DLLOM,C,CAr,JKUUVWY[\MMM..}=M LL7Mm+F	, $$r&   )r>   r.   r   r   s   @r'   r  r  W  s    8z 88%r&   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsConvFlowr   c                    t         |           |j                  | _        |j                  dz  | _        |j                  | _        |j                  | _	        t        j                  | j
                  | j                  d      | _        t        |      | _        t        j                  | j                  | j
                  | j                  dz  dz
  z  d      | _        y )NrZ   r   r   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsrk   duration_predictor_tail_boundrD   r   r   r   r  conv_ddsr   r   s     r'   r   zVitsConvFlow.__init__  s    %11#<<A;; >>		$"4"4d6J6JAN5f=4#7#79K9Kt}}_`O`cdOd9eghir&   c                     t        j                  || j                  gdz  d      \  }}| j                  |      }| j	                  |||      }| j                  |      |z  }|j                  \  }}	}
|j                  ||	d|
      j                  dddd      }|dd | j                  f   t        j                  | j                        z  }|d| j                  d| j                  z  f   t        j                  | j                        z  }|dd| j                  z  d f   }t        |||||| j                        \  }}t        j                  ||gd      |z  }|st        j                   ||z  ddg      }||fS |d fS )	NrZ   r   rV   r=   r   r   .)rC   rD   )r!   r   r  r   r1  r   r_   reshapepermuterk   mathrh   r-  rS   rD   r
  rc   )r   r?   r   r   rC   r  r  r   
batch_sizer   lengthr@   rA   rB   rQ   rP   r  s                    r'   r   zVitsConvFlow.forward  s   "'++ft7I7I6JQ6NTU"V
Kj1m\CVW}5D'1'7'7$
Hf%--j(BOWWXY[\^_abc+C4==,@ADIIdNbNbDcc,S$--!dmmBS2S-STW[W`W`aeauauWvv#0a$--6G6I1I#J #K $$
 [ ))Z51=L#iil(BQFKOO++D= r&   r  r   r   s   @r'   r+  r+    s    	jz 	j!r&   r+  c                   ,     e Zd Zdef fdZddZ xZS )VitsElementwiseAffiner   c                 $   t         |           |j                  | _        t	        j
                  t        j                  | j                  d            | _        t	        j
                  t        j                  | j                  d            | _	        y Nr   )
r   r   r.  r   r   	Parameterr!   zeros	translate	log_scaler   s     r'   r   zVitsElementwiseAffine.__init__  sY    77ekk$--&CDekk$--&CDr&   c                 .   |s]| j                   t        j                  | j                        |z  z   }||z  }t        j                  | j                  |z  ddg      }||fS || j                   z
  t        j                  | j                         z  |z  }|d fS Nr   rZ   )r>  r!   rK   r?  rc   )r   r?   r   r   rC   rP   r  s          r'   r   zVitsElementwiseAffine.forward  s    nnuyy'@6'IIG,G#ii(E1vNOO++.%))T^^O2LL|[GD= r&   r  r   r   s   @r'   r9  r9    s    Ez E!r&   r9  c                   &     e Zd Z fdZddZ xZS )VitsStochasticDurationPredictorc                    t         |           |j                  }|j                  }t	        j
                  ||d      | _        t	        j
                  ||d      | _        t        ||j                        | _
        |dk7  rt	        j
                  ||d      | _        t	        j                         | _        | j                  j                  t        |             t!        |j"                        D ]&  }| j                  j                  t%        |             ( t	        j
                  d|d      | _        t	        j
                  ||d      | _        t        ||j                        | _        t	        j                         | _        | j,                  j                  t        |             t!        |j"                        D ]&  }| j,                  j                  t%        |             ( y )Nr   )r%  r   )r   r   r   r   r   r   r   r   r  duration_predictor_dropoutr1  r   r   r  r   r9  r   duration_predictor_num_flowsr+  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr-  r   r   s        r'   r   z(VitsStochasticDurationPredictor.__init__  st   11	 ,,		/?AF?OQG5::

 >		)_a@DI]]_


/78v::; 	4AJJl623	4  YYq/1= ii!L:::

 --/4V<=v::; 	9AOO""<#78	9r&   c                    t        j                  |      }| j                  |      }|)t        j                  |      }|| j                  |      z   }| j	                  ||      }| j                  |      |z  }|s| j                  |      }| j                  ||      }| j                  |      |z  }t        j                  |j                  d      d|j                  d            j                  |j                  |j                        |z  }d}	|}
| j                  D ]/  } ||
|||z         \  }
}t        j                  |
dg      }
|	|z  }	1 t        j                   |
ddgd      \  }}|	t        j"                  t$        j&                  j)                  |      t$        j&                  j)                  |       z   |z  ddg      z  }	t        j"                  dt+        j,                  dt*        j.                  z        |dz  z   z  |z  ddg      |	z
  }|t        j0                  |      z
  |z  }t        j,                  t        j2                  |d            |z  }t        j"                  | ddg      }t        j4                  ||gd      }| j6                  D ],  } ||||      \  }}t        j                  |dg      }||z  }. t        j"                  d	t+        j,                  dt*        j.                  z        |dz  z   z  |z  ddg      |z
  }||z   S t9        t;        | j6                              }|d d
 |d   gz   }t        j                  |j                  d      d|j                  d            j                  |j                  |j                        |z  }|D ](  }t        j                  |dg      } ||||d      \  }}* t        j                   |ddgd      \  }}|S )Nr   rZ   )devicedtype)r   r   rV         gh㈵>g      ?r=   T)r   rC   )r!   detachr   r   r1  r   rG  rI  rH  randnsizetorM  rN  rJ  r  r   rc   r   rL   
logsigmoidr5  rJ   pir0   	clamp_minr
  r  listr  )r   r?   r   r   	durationsrC   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr  r  r  r  logqlog_determinant_sumlatentsnllr  r   log_durations                         r'   r   z'VitsStochasticDurationPredictor.forward  s   f%v&*"',,/B"Cdii(;<<Fv|4',6 ..y9M ..}lKM //>MM INN1-q)..2CDGGv}}djdpdpGq  -.) 0 A59%|R_I_62!? %*JJ/@1#$F!-@-A ',kk2CaVQR&S#J)UYY))*58P8PR\Q\8]]ammpqstou. ) 		$$((1tww;"7;KQ;N"OPS__bcefagh/0 
 $emmJ&??<OJ5??:t#DETJ"'))ZK!Q"@ii[ 9qAG

 7+/[a+b(**Wqc2#6#7
 ))C488AK#8GQJ#GH<WZ[]^Y_`cvvC:$**-.E#2J%),E FKKNAv{{1~>AA^d^j^jAk   c**Wqc2!'<V]ab
c $kk'Aq6qAOL!r&   )NNFrU   r   r   r   r   r   r   r   s   @r'   rC  rC    s    9@@ r&   rC  c                   &     e Zd Z fdZddZ xZS )VitsDurationPredictorc                    t         |           |j                  }|j                  }t	        j
                  |j                        | _        t	        j                  |j                  |||dz        | _
        t	        j                  ||j                        | _        t	        j                  ||||dz        | _        t	        j                  ||j                        | _        t	        j                  |dd      | _        |j"                  dk7  r1t	        j                  |j"                  |j                  d      | _        y y )NrZ   )r   epsr   r   )r   r   r  "duration_predictor_filter_channelsr   r   rE  r   r   r   conv_1r$  layer_norm_epsnorm_1conv_2norm_2projr   r   )r   r   r   r-  r   s       r'   r   zVitsDurationPredictor.__init__'  s    ;; CCzz&"C"CDii 2 2O[ZeijZjkll?8M8MNii+WbfgWghll?8M8MNIIoq!4	((A-		&"?"?ASASUVWDI .r&   c                 `   t        j                  |      }|)t        j                  |      }|| j                  |      z   }| j                  ||z        }t        j                  |      }| j                  |j                  dd            j                  dd      }| j                  |      }| j                  ||z        }t        j                  |      }| j                  |j                  dd            j                  dd      }| j                  |      }| j                  ||z        }||z  S r'  )r!   rQ  r   rj  relurl  r(  r   rm  rn  ro  )r   r?   r   r   s       r'   r   zVitsDurationPredictor.forward6  s   f%*"',,/B"Cdii(;<<FVl23F#V--a45??2Ff%Vl23F#V--a45??2Ff%6L01$$r&   r.   rc  r   s   @r'   re  re  &  s    X%r&   re  c                   &    e Zd ZdZdef fdZdej                  dedefdZ		 	 	 	 ddej                  d	e
ej                     d
e
ej                     de
ej                     dedeej                  e
ej                     f   fdZd Zd Zd Z xZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        | j                  | j
                  z  | _	        | j                  dz  | _
        | j                  | j
                  z  | j                  k7  r&t        d| j                   d| j
                   d      t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        | j                  rt        j&                  t)        j*                  d| j                  dz  dz   | j                        | j                  z        | _        t        j&                  t)        j*                  d| j                  dz  dz   | j                        | j                  z        | _        y y )NrO  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   rZ   )r   r   r   rK  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr^   r   Linearuse_biask_projv_projq_projout_projr<  r!   rR  	emb_rel_k	emb_rel_vr   s     r'   r   zVitsAttention.__init__N  s   ++33//!--$..8}}d*MMDNN*t~~=[\`\j\j[k.t~~.>bB 
 iiV__UiiV__UiiV__U		$..$..vW\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN r&   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S rA  )viewrv  ry  r(  
contiguous)r   r  r  r  s       r'   _shapezVitsAttention._shapeg  s7    {{3GQQRSUVWbbddr&   r   key_value_statesattention_masklayer_head_maskoutput_attentionsr   c                 X	   |j                         \  }}}| j                  |      | j                  z  }	| j                  | j	                  |      d|      }
| j                  | j                  |      d|      }|| j                  z  d| j                  f} | j                  |	||      j                  | }	 |
j                  | }
 |j                  | }|
j                  d      }t        j                  |	|
j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                | j                  X| j                  | j                  |      }t        j                   |	|j                  dd            }| j#                  |      }||z  }|{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t$        j&                  j)                  |d	      }||j                         | j                  fk7  r*t        d
| j                  f d|j                                |j                  dddd      |j                  || j                  ||      z  }|j                  || j                  z  ||      }|r?|j                  || j                  ||      }|j                  || j                  z  ||      }nd}t$        j&                  j+                  || j*                  | j,                        }t        j                  ||      }|j                         || j                  z  || j                  fk7  r7t        d|| j                  || j                  f d|j                                | j                  H| j                  | j.                  |      }| j1                  |      }t        j                   ||      }||z  }|j                  || j                  || j                        }|j                  dd      }|j3                  ||| j4                        }| j7                  |      }||fS )z#Input shape: Batch x Time x Channelr=   r   rZ   z$Attention weights should be of size z	, but is NrP  z!Attention mask should be of size rV   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )rS  r  rz  r  r}  r~  rv  ry  r  r!   bmmr(  r^   rx  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rL   r`   r   r  r  '_absolute_position_to_relative_positionr3  rK  r  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r'   r   zVitsAttention.forwardj  s    (,,.Wa {{=1DLL@ [[]!;RE
{{4;;}#=r3GDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 '&*&C&CDNNT[&\##ll<9P9Z9Z[]_a9bcOGGXLL(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 '(,(E(EdnnV](^%#KKJW <<(8:STL<'K!&&sDNNGT]]S!++Aq1 "))#wGmmK0111r&   c           	          t        || j                  dz   z
  d      }|dkD  r&t        j                  j	                  |dd||ddg      }t        | j                  dz   |z
  d      }|d|z  z   dz
  }|d d ||f   S )Nr   r   rZ   )r]   rx  r   rL   r:   )r   relative_embeddingsr7  
pad_lengthslice_start_positionslice_end_positions         r'   r  z&VitsAttention._get_relative_embeddings  s    4#3#3a#78!<
>"$--"3"34G!QPZ\fhiklIm"n"D$4$4q$8F#BAF1AJ>B"1&:;M&M#MNNr&   c                 N   |j                         \  }}}t        j                  j                  |g d      }|j	                  ||dz  |z  g      }t        j                  j                  |d|dz
  ddg      }|j	                  ||dz   d|z  dz
  g      }|d d d ||dz
  d f   }|S )N)r   r   r   r   r   r   rZ   r   r   rS  r   rL   r:   r  r   xbatch_headsr7  r   x_flatx_finals          r'   r  z5VitsAttention._relative_position_to_absolute_position  s    !"VQ MMa!34 fqj6&9:;""6Avz1a+@A ++{FQJF
QGH!WfWfqjl23r&   c           	      F   |j                         \  }}}t        j                  j                  |d|dz
  ddddg      }|j	                  ||d|z  dz
  z  g      }t        j                  j                  ||dddg      }|j	                  ||d|z  g      d d d d dd f   }|S )Nr   r   rZ   r  r  s          r'   r  z5VitsAttention._absolute_position_to_relative_position  s    !"VQ MMa!VaZAq!!<=fF
Q&?@A ""6FAq!+<=++{FAJ?@AqrJr&   )NNNF)r   r   r   r    r   r   r!   Tensorr   r  r   boolr$   r   r  r  r  r   r   s   @r'   rs  rs  K  s    Irz r2eU\\ eC ec e 481526"'`2||`2 #5<<0`2 !.	`2
 "%,,/`2  `2 
u||Xell33	4`2DO
r&   rs  c                   $     e Zd Z fdZd Z xZS )VitsFeedForwardc                 d   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        |j                  dkD  r/|j                  dz
  dz  }|j                  dz  }||ddddg| _        y d | _        y )Nr   rZ   r   )r   r   r   r   r   ffn_dimffn_kernel_sizerj  rm  r   activation_dropoutr   
isinstance
hidden_actstrr	   act_fnr   )r   r   pad_left	pad_rightr   s       r'   r   zVitsFeedForward.__init__  s    ii 2 2FNNFDZDZ[ii0B0BFDZDZ[zz&";";<f''- !2!23DK ++DK!!A%..2q8H..!3I$iAq!<DLDLr&   c                    |j                  ddd      }|j                  ddd      }||z  }| j                  *t        j                  j	                  || j                        }| j                  |      }| j                  |      }| j                  |      }||z  }| j                  *t        j                  j	                  || j                        }| j                  |      }||z  }|j                  ddd      }|S )Nr   rZ   r   )	r4  r   r   rL   r:   rj  r  r   rm  )r   r   r   s      r'   r   zVitsFeedForward.forward  s    %--aA6#++Aq!4%4<<#MM--mT\\JMM2M2]3%4<<#MM--mT\\JMM2%4%--aA6r&   rc  r   s   @r'   r  r    s     $r&   r  c            	            e Zd Zdef fdZ	 	 ddej                  dej                  deej                     de	fdZ
 xZS )	VitsEncoderLayerr   c                 j   t         |           t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  |j                        | _        t        |      | _        t	        j                  |j                  |j                        | _        y )Nrg  )r   r   rs  	attentionr   r   hidden_dropoutr   r$  r   rk  
layer_normr  feed_forwardfinal_layer_normr   s     r'   r   zVitsEncoderLayer.__init__  sz    &v.zz&"7"78,,v'9'9v?T?TU+F3 "V-?-?VEZEZ [r&   r   r   r  r  c                 
   |}| j                  |||      \  }}| j                  |      }| j                  ||z         }|}| j                  ||      }| j                  |      }| j	                  ||z         }|f}|r||fz  }|S )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rP   s           r'   r   zVitsEncoderLayer.forward$  s     !&*nn')/ '5 '
#| ]3=(@A ))-F]3--h.FG "&Gr&   r  )r   r   r   r   r   r!   r  r"   r   r  r   r   r   s   @r'   r  r    sW    \z \ 26"'|| '' !.	
  r&   r  c                        e Zd Zdef fdZ	 	 	 	 ddej                  dej                  deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )VitsEncoderr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        |j                  | _
        y c c}w r  )r   r   r   r   r   r   num_hidden_layersr  layersgradient_checkpointing	layerdropr  s      r'   r   zVitsEncoder.__init__C  s\    mmuVMeMeGf$g!%5f%=$gh&+#)) %hs   A4r   r   r  r  output_hidden_statesreturn_dictr   c                    |rdnd }|rdnd }|t        ||j                        }||z  }t               xs t        |       }	| j                  D ]l  }
|r||fz   }t
        j                  j                  dd      }| j                  xr || j                  k  }|r|	r |
||||      }|d   }|rd}|sd|d   fz   }n ||z  }|r||fz   }|st        d |||fD              S t        |||      S )Nr%   r   r   )r  r   r  )NNc              3   &   K   | ]	  }||  y wr.   r%   ).0vs     r'   	<genexpr>z&VitsEncoder.forward.<locals>.<genexpr>}  s     mq_`_lms   )r*   r   r   )r   rN  r
   r   r  rI   randomuniformr  r  r$   r   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r'   r   zVitsEncoder.forwardJ  s;    #7BD$5b4 %7H[H[\N%402R6LT6R![[ 	PM#$58H$H! #%))"3"3Aq"9!]]U0Cdnn0TN![ -!#1!-&7	! !.a 0 , &9]1=M<O&O#-	P0 &4 1]4D Dm]4EGZ$[mmm++*
 	
r&   )NNNN)r   r   r   r   r   r!   r"   r   r  r  r   r$   r   r   r   r   s   @r'   r  r  B  s    *z * 26,0/3&*9
((9
 ''9
 !.	9

 $D>9
 'tn9
 d^9
 
uo%	&9
r&   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  de	ej                     de	e
   de	e
   d	e	e
   d
eeej                     ef   fdZ xZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                 ,   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        |      | _
        t        j                  |j                  |j                  dz  d      | _        y )NrZ   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r'   r   zVitsTextEncoder.__init__  so    LL):):F<N<NPVPcPcd"6*yy!3!3V5E5E5IWXYr&   	input_idsr   r  r  r  r  r   c                    | j                  |      t        j                  | j                  j                        z  }| j                  ||||||      }|s|d   n|j                  }	| j                  |	j                  dd            j                  dd      |z  }
t        j                  |
| j                  j                  d      \  }}|s|	||f|dd  z   }|S t        |	|||j                  |j                        S )N)r   r   r  r  r  r  r   r   rZ   rV   )r*   r+   r,   r   r   )r  r5  rh   r   r   r  r*   r  r(  r!   r   r   r)   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr*   r   r+   r,   rP   s                 r'   r   zVitsTextEncoder.forward  s    )))4tyyAXAX7YY,,'%)/!5# ' 
 7BOA.GhGh.88A>?II!QOR^^+0;;udkk>S>SYZ+[(((+7JKo^_^`NaaGN$/# 3)77&11
 	
r&   )NNNT)r   r   r   r    r   r   r!   r  r"   r   r  r   r$   r)   r   r   r   s   @r'   r  r    s    Zz Z 26,0/3&*#
<<#
 ''#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\\"$99	:#
r&   r  c                   D    e Zd ZU eed<   dZdZdZdej                  fdZ
y)VitsPreTrainedModelr   vitsr  Tmodulec                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yt        |t        j                  t        j                  f      rt        j                  j                  |j
                         |j                  jt!        j"                  |j$                  |j&                  |j(                  d   z  z        }t        j                  j+                  |j                  | |       yyt        |t        j,                        rf|j
                  j                  j                  d|       |j.                  2|j
                  j                  |j.                     j                          yyt        |t0              r| j                   j2                  r| j                   j4                  | j                   j6                  z  }t        j                  j                  |j8                  |dz         t        j                  j                  |j:                  |dz         yyt        |t<              rI|j>                  j                  j                          |j@                  j                  j                          yy)	zInitialize the weightsr>   )r   stdNrU   r   )r   r   rO  )r  )!r   initializer_ranger  r   r{  r   datanormal_r   zero_r$  fill_r   r   initkaiming_normal_r5  rh   r  r   r   uniform_r  padding_idxrs  rx  r   ru  r  r  r9  r>  r?  )r   r  r  kry  s        r'   _init_weightsz!VitsPreTrainedModel._init_weights  s?   kk++fbii(MM&&CS&9{{&  &&( '-KK""$MM$$S)B,>,> ?@GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' -MM&&CS&9!!-""6#5#56<<> ..{{&&;;22dkk6U6UU 0 0hnE 0 0hnE '  56!!'')!!'') 7r&   N)r   r   r   r   r#   base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler  r%   r&   r'   r  r    s)    !O&*#*BII *r&   r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                        e Zd Zdef fdZd Ze	 	 	 	 	 	 	 ddeej                     deej                     dee
   dee   dee   d	ee   d
eej                     deee   ef   fd       Z xZS )	VitsModelr   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        |j                  rt        |      | _        nt        |      | _        |j                  dkD  r/t        j                  |j                  |j                         | _        t%        |      | _        |j(                  | _        |j*                  | _        |j,                  | _        | j/                          y r;  )r   r   r   r  text_encoderr  r  r   decoder"use_stochastic_duration_predictionrC  duration_predictorre  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterZ  noise_scale_duration	post_initr   s     r'   r   zVitsModel.__init__  s     +F3-f5	"6*44&Ef&MD#&;F&CD#"!#f.A.A6C`C`!aD "6f!= $11!--$*$?$?! 	r&   c                     | j                   S r.   )r  )r   s    r'   get_encoderzVitsModel.get_encoder  s       r&   r  r  
speaker_idr  r  r  labelsr   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j
                  j                  j                  j                  }|!|j                  d      j                  |      }	n3t        j                  |      j                  d      j                  |      }	| j                   j                  dkD  r|d|cxk  r| j                   j                  k  s(n t        d| j                   j                  dz
   d      t        |t               r"t        j"                  d|| j$                  	      }| j'                  |      j                  d      }
nd}
| j                  ||	||||
      }|s|d   n|j(                  }|j+                  dd      }|	j+                  dd      }	|s|d   n|j,                  }|s|d   n|j.                  }| j                   j0                  r!| j3                  ||	|
d| j4                        }n| j3                  ||	|
      }d| j6                  z  }t        j8                  t        j:                  |      |	z  |z        }t        j<                  t        j>                  |ddg      d      jA                         }t        jB                  |jE                         |j                  |j$                        }|j                  d      |j                  d      k  }|j                  d      j                  |	j                        }t        j                  |	d      t        j                  |d      z  }|jF                  \  }}}}t        jH                  |d      jK                  ||z  d      }t        jB                  ||j                  |j$                        }|j                  d      |k  }|j                  |j                        jK                  |||      }|tL        jN                  jQ                  |g d      ddddf   z
  }|j                  d      j+                  dd      |z  }t        jR                  |jU                  d      |      j+                  dd      }t        jR                  |jU                  d      |      j+                  dd      }|t        jV                  |      t        j:                  |      z  | jX                  z  z   }| j[                  |||
d      }||z  } | j]                  | |
      }!|!jU                  d      }!|t_        j`                  | j                   jb                        z  }"|s|!|"| f|dd z   }#|#S te        |!|"| |jf                  |jh                        S )a  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r=   r   r   z Set `speaker_id` in the range 0-.r   )rS  
fill_valuerM  )r  r   r  r  r  r  rZ   T)rC   rZ  rU   )rN  rM  )r   r   r   r   r   r   r   r  )r   r   r   r   r   )5r   r  r  use_return_dictNotImplementedErrorr  r  r   rN  	unsqueezerT  r!   	ones_liker	  r^   r  r   fullrM  r
  r*   r(  r+   r,   r  r  r  r  ceilrK   rW  rc   longaranger]   r_   ra   r  r   rL   r:   r  squeezer   rZ  r  r  rI   prodr   r   r   r   )$r   r  r  r  r  r  r  r  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r+   r,   rb  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr6  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsr`  r   r   r   rP   s$                                       r'   r   zVitsModel.forward  s>   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&NOO&&33::@@
%!/!9!9"!=!@!@!L!&!;!E!Eb!I!L!LZ!X;;##a'J,B
=T[[%=%== #CDKKD\D\_`D`Caab!cdd*c*"ZZTjQUQ\Q\]
!%!3!3J!?!I!I"!M!%"//+)/!5# 0 
 7B+A.GZGlGl%//15/99!Q?4?)!,EXEdEd<G1!4M`MtMt;;9922"" 55 3 L  22=BTVhiLT///::eii58JJ\YZ!OOEIIhA,GKPPR ,,0446>O>U>U^o^v^vw%//25F5P5PQR5SS1;;A>AABTBZBZ[ OO$6:U__M`bd=ee	5>__2
A}l||Hb166zL7PRST,,}HNN8??[))!,|;%((9>>z<Yfg&):):=J\)]^_adbdad^d)ee''*44Q:YF ll4<<?K@JJ1aP#ll4<<?<OPZZ[\^_`#e&6&6{&CeiiPcFd&dgkgwgw&ww))M+>@R\`)a 33<<-?@##A&,rwwt{{7Q7Q/RR!1;?BUVWVXBYYGN-#-;;*55
 	
r&   )NNNNNNN)r   r   r   r   r   r  r   r   r!   r  r   r  r"   r   r$   r   r   r   r   r   s   @r'   r  r    s    z 4!  -115$(,0/3&*.2~
ELL)~
 !.~
 SM	~

 $D>~
 'tn~
 d^~
 **+~
 
uSz?*	+~
 ~
r&   r  )Fg      @MbP?r1  r1  )?r    r5  dataclassesr   typingr   r   r   numpyrI   r!   torch.utils.checkpointr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r)   jitscriptr8   rS   rM   r  r   r   r   r   r  r  r  r+  r9  rC  re  rs  r  r  r  r  r  r  __all__r%   r&   r'   <module>rC     sR     ! ' '     ! @ 7 B 9 < - , * 
		H	% 
:k : :$ 
:K : :   G TE%PM5%((// M5`)299 )&;299 ;|U")) Up!		 !6		 &+%BII +%\(!299 (!V!BII !$a bii a H"%BII "%JcBII cL'bii 'T$1 $NA
")) A
H/
bii /
d  */  *  *F 
]
# ]

]
@ -
.r&   