
    rhd                        d Z ddlmZ ddlmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ d
dlmZ  ej$                  e      Ze ed       G d de
                    Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Ze G d de             ZdgZy)zPyTorch UnivNetModel model.    )	dataclass)OptionalUnionN)nn   )ModelOutput)PreTrainedModel)auto_docstringlogging   )UnivNetConfigz
    Output class for the [`UnivNetModel`], which includes the generated audio waveforms and the original unpadded
    lengths of those waveforms (so that the padding can be removed by [`UnivNetModel.batch_decode`]).
    )custom_introc                   b    e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   y)UnivNetModelOutputa"  
    waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Batched 1D (mono-channel) output audio waveforms.
    waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The batched length in samples of each unpadded waveform in `waveforms`.
    N	waveformswaveform_lengths)
__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r        /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/univnet/modeling_univnet.pyr   r       s4     .2Ix))*148hu0018r   r   c                   T     e Zd ZdZdef fdZdej                  fdZd Z	d Z
 xZS )#UnivNetKernelPredictorResidualBlockz
    Implementation of the residual block for the kernel predictor network inside each location variable convolution
    block (LVCBlock).

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
    configc                 
   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        | j
                  dz
  dz  }t        j                  | j                        | _        t        j                  | j                  | j                  | j
                  |d      | _        t        j                  | j                  | j                  | j
                  |d      | _        y )Nr      Tpaddingbias)super__init__model_in_channelschannelskernel_predictor_conv_sizekernel_sizekernel_predictor_dropoutdropout_probleaky_relu_sloper   DropoutdropoutConv1dconv1conv2)selfr   r#   	__class__s      r   r&   z,UnivNetKernelPredictorResidualBlock.__init__=   s     	00!<<";; & 7 7##a'A-zz$"3"34YYt}}dmmT=M=MW^eij
YYt}}dmmT=M=MW^eij
r   hidden_statesc                    |}| j                  |      }| j                  |      }t        j                  j	                  || j
                        }| j                  |      }t        j                  j	                  || j
                        }||z   S N)r/   r1   r   
functional
leaky_relur-   r2   )r3   r5   residuals      r   forwardz+UnivNetKernelPredictorResidualBlock.forwardM   st     ]3

=100@U@UV

=100@U@UVx''r   c                    t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                          || j                         y Nweight_norm)r   utilsr>   hasattrparametrizationsr1   r2   r3   r>   s     r   apply_weight_normz5UnivNetKernelPredictorResidualBlock.apply_weight_normW   sR    hh**288,,m<((33??KDJJDJJr   c                     t         j                  j                  | j                         t         j                  j                  | j                         y r7   )r   r?   remove_weight_normr1   r2   r3   s    r   rE   z6UnivNetKernelPredictorResidualBlock.remove_weight_norm_   s.    
##DJJ/
##DJJ/r   )r   r   r   r   r   r&   r   r   r;   rC   rE   __classcell__r4   s   @r   r   r   3   s3    kk (U%6%6 ( 0r   r   c                   b     e Zd ZdZ	 	 d
dededef fdZdej                  fdZ	d Z
d	 Z xZS )UnivNetKernelPredictora  
    Implementation of the kernel predictor network which supplies the kernel and bias for the location variable
    convolutional layers (LVCs) in each UnivNet LVCBlock.

    Based on the KernelPredictor implementation in
    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L7).

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        conv_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size for the location variable convolutional layer kernels (convolutional weight tensor).
        conv_layers (`int`, *optional*, defaults to 4):
            The number of location variable convolutional layers to output kernels and biases for.
    r   conv_kernel_sizeconv_layersc                    t         |           |j                  | _        d|j                  z  | _        || _        || _        | j                  | j                  z  | j
                  z  | j                  z  | _        | j                  | j                  z  | _        |j                  | _
        |j                  | _        |j                  | _        |j                  | _        |j"                  | _        | j                  dz
  dz  }t%        j&                  | j                  | j                  ddd      | _        t%        j*                  t-        | j                         D cg c]  }t/        |       c}      | _        t%        j&                  | j                  | j                  | j                  |d      | _        t%        j&                  | j                  | j                  | j                  |d      | _        y c c}w )Nr!   r      Tr"   )r%   r&   model_hidden_channelsconv_in_channelsconv_out_channelsrK   rL   kernel_channelsbias_channelsnum_mel_binsresnet_in_channels kernel_predictor_hidden_channelsresnet_hidden_channelsr)   resnet_kernel_sizekernel_predictor_num_blocks
num_blocksr-   r   r0   
input_conv
ModuleListranger   	resblockskernel_conv	bias_conv)r3   r   rK   rL   r#   _r4   s         r   r&   zUnivNetKernelPredictor.__init__u   s    	 & < <!"V%A%A!A 0& !!D$:$::T=R=RRUYUeUee 	 "33d6F6FF"("5"5&,&M&M#"("C"C << & 7 7**Q.14))D$;$;T=X=XZ[efmqr]bcgcrcr]s'tXY(KF(S'tu99'')=)=t?V?V`gnr
 ''););T=T=T^elp
 (us   G2spectrogramc                    |j                   \  }}}| j                  |      }t        j                  j	                  || j
                        }| j                  D ]
  } ||      } | j                  |      }| j                  |      }|j                  || j                  | j                  | j                  | j                  |      j                         }	|j                  || j                  | j                  |      j                         }
|	|
fS )a  
        Maps a conditioning log-mel spectrogram to a tensor of convolutional kernels and biases, for use in location
        variable convolutional layers. Note that the input spectrogram should have shape (batch_size, input_channels,
        seq_length).

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, input_channels, seq_length)`):
                Tensor containing the log-mel spectrograms.

        Returns:
            tuple[`torch.FloatTensor, `torch.FloatTensor`]: tuple of tensors where the first element is the tensor of
            location variable convolution kernels of shape `(batch_size, self.conv_layers, self.conv_in_channels,
            self.conv_out_channels, self.conv_kernel_size, seq_length)` and the second element is the tensor of
            location variable convolution biases of shape `(batch_size, self.conv_layers. self.conv_out_channels,
            seq_length)`.
        )shaper[   r   r8   r9   r-   r^   r_   r`   viewrL   rP   rQ   rK   
contiguous)r3   rb   
batch_sizera   
seq_lengthr5   resblockkernel_hidden_statesbias_hidden_stateskernelsbiasess              r   r;   zUnivNetKernelPredictor.forward   s	   " %0$5$5!
Az400@U@UV 	4H$]3M	4  $//>!^^M: '++!!""!!
 *, 	 $((""	

 *, 	 r   c                 v   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  D ]  }|j                            || j                          || j                         y r=   )
r   r?   r>   r@   rA   r[   r^   rC   r_   r`   r3   r>   layers      r   rC   z(UnivNetKernelPredictor.apply_weight_norm   s    hh**288,,m<((33??KDOO$^^ 	&E##%	&D$$%DNN#r   c                 <   t         j                  j                  | j                         | j                  D ]  }|j                           t         j                  j                  | j
                         t         j                  j                  | j                         y r7   )r   r?   rE   r[   r^   r_   r`   r3   rp   s     r   rE   z)UnivNetKernelPredictor.remove_weight_norm   se    
##DOO4^^ 	'E$$&	'
##D$4$45
##DNN3r   )r      r   r   r   r   r   intr&   r   r   r;   rC   rE   rG   rH   s   @r   rJ   rJ   d   sO    & !"	$
$
 $
 	$
L,5#4#4 ,\	$4r   rJ   c                        e Zd ZdZdededef fdZddZ	 	 ddej                  dej                  d	ej                  ded
ef
dZ
d Zd Z xZS )UnivNetLvcResidualBlocka  
    Implementation of the location variable convolution (LVC) residual block for the UnivNet residual network.

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        kernel_size (`int`):
            The kernel size for the dilated 1D convolutional layer.
        dilation (`int`):
            The dilation for the dilated 1D convolutional layer.
    r   r*   dilationc                 N   t         |           |j                  | _        || _        || _        |j                  | _        | j
                  | j                  dz
  z  dz  }t        j                  | j                  | j                  | j                  || j
                        | _	        y )Nr   r!   )r#   rx   )
r%   r&   rO   hidden_channelsr*   rx   r-   r   r0   conv)r3   r   r*   rx   r#   r4   s        r   r&   z UnivNetLvcResidualBlock.__init__   s     	%;;&  & 7 7--4#3#3a#78A=II    ]]
	r   c                    |}t         j                  j                  || j                        }| j	                  |      }t         j                  j                  || j                        }| j                  ||||      }t        j                  |d d d | j                  d d f         t        j                  |d d | j                  d d d f         z  }||z   }|S N)hop_size)
r   r8   r9   r-   r{   location_variable_convolutionr   sigmoidrz   tanh)r3   r5   kernelr$   r~   r:   s         r   r;   zUnivNetLvcResidualBlock.forward   s     00@U@UV		-000@U@UV::=&RVai:jmA7M9M9M7Mq4P&QRUZU_U_!T113Q67V
 
 !=0r   r5   r   r$   r~   c                 F   |j                   \  }}}|j                   \  }}}	}
}|||z  k7  rt        d||z   d| d      |t        |
dz
  dz        z  }t        j                  j                  |||fdd      }|j                  d|d|z  z   |      }||k  r$t        j                  j                  |d|fdd      }|j                  d||      }|d	d	d	d	d	d	d	d	d	|f   }|j                  dd
      }|j                  d
|
d      }t        j                  d||      }|j                  t        j                        }|j                  d      j                  d      j                  t        j                        }||z   }|j                         j                  ||	d      }|S )u  
        Performs location-variable convolution operation on the input sequence (hidden_states) using the local
        convolution kernel. This was introduced in [LVCNet: Efficient Condition-Dependent Modeling Network for Waveform
        Generation](https://huggingface.co/papers/2102.10815) by Zhen Zheng, Jianzong Wang, Ning Cheng, and Jing Xiao.

        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, in_channels, in_length)`):
                The input sequence of shape (batch, in_channels, in_length).
            kernel (`torch.FloatTensor` of shape `(batch_size, in_channels, out_channels, kernel_size, kernel_length)`):
                The local convolution kernel of shape (batch, in_channels, out_channels, kernel_size, kernel_length).
            bias (`torch.FloatTensor` of shape `(batch_size, out_channels, kernel_length)`):
                The bias for the local convolution of shape (batch, out_channels, kernel_length).
            dilation (`int`, *optional*, defaults to 1):
                The dilation of convolution.
            hop_size (`int`, *optional*, defaults to 256):
                The hop_size of the conditioning sequence.
        Returns:
            `torch.FloatTensor`: the output sequence after performing local convolution with shape (batch_size,
            out_channels, in_length).
        z#Dim 2 of `hidden_states` should be z
) but got zX. Please check `hidden_states` or `kernel` and `hop_size` to make sure they are correct.r   r!   constantr   r   Nrs   zbildsk,biokl->bolsd)memory_format)rd   
ValueErrorru   r   r8   padunfold	transposer   einsumtochannels_last_3d	unsqueezerf   re   )r3   r5   r   r$   rx   r~   batchra   	in_lengthout_channelsr*   kernel_lengthr#   output_hidden_statess                 r   r   z5UnivNetLvcResidualBlock.location_variable_convolution  s   < ,11q)=C\\:q,]125mh6N5OzZcYd e] ] 
 S+/Q!677 ))-'79KZYZ[%,,Q1w;0FQhMM--ma]JXYZM%,,Q(C%aAq)8)&;<%//15%,,QQ?  %||,A=RXY366UE[E[6\~~b!++B/22AWAW2X3d:3>>@EEe\[]^##r   c                     t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         y r=   )r   r?   r>   r@   rA   r{   rB   s     r   rC   z)UnivNetLvcResidualBlock.apply_weight_normP  sF    hh**288,,m<((33??KDIIr   c                 V    t         j                  j                  | j                         y r7   )r   r?   rE   r{   rF   s    r   rE   z*UnivNetLvcResidualBlock.remove_weight_normW  s    
##DII.r      )r   r   )r   r   r   r   r   ru   r&   r;   r   r   r   rC   rE   rG   rH   s   @r   rw   rw      s    


 
 	
,* ?$((?$ !!?$ 	?$
 ?$ ?$B/r   rw   c                   x     e Zd ZdZ	 ddededef fdZdej                  dej                  fdZ	d	 Z
d
 Z xZS )UnivNetLvcBlocka#  
    Implementation of the location variable convolution (LVC) residual block of the UnivNet residual block. Includes a
    `UnivNetKernelPredictor` inside to predict the kernels and biases of the LVC layers.

    Based on LVCBlock in
    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L98)

    Parameters:
        config (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        layer_id (`int`):
            An integer corresponding to the index of the current LVC resnet block layer. This should be between 0 and
            `len(config.resblock_stride_sizes) - 1)` inclusive.
        lvc_hop_size (`int`, *optional*, defaults to 256):
            The hop size for the location variable convolutional layers.
    r   layer_idlvc_hop_sizec                    t         |           |j                  | _        |j                  |   | _        |j                  |   | _        |j                  |   | _	        || _
        |j                  | _        t        | j                        | _        t        j                  | j                  | j                  d| j                  z  | j                  | j                  dz  | j                  dz  z   | j                  dz        | _        t#        || j
                  | j                        | _        t        j&                  t)        | j                        D cg c]&  }t+        || j
                  | j                  |         ( c}      | _        y c c}w )Nr!   )strider#   output_padding)r%   r&   rO   rz   resblock_kernel_sizesr*   resblock_stride_sizesr   resblock_dilation_sizes	dilationscond_hop_lengthr-   lenrZ   r   ConvTranspose1d	convt_prerJ   kernel_predictorr\   r]   rw   r^   )r3   r   r   r   ir4   s        r   r&   zUnivNetLvcBlock.__init__m  s<    	%;;!77A228<77A+ & 7 7dnn-++    O;;KK1$t{{Q6;;?
 !7vt?O?OQUQ`Q` a[`aeapap[qrVW$VT-=-=t~~a?PQr
rs   +Fr5   rb   c           	      R   t         j                  j                  || j                        }| j	                  |      }| j                  |      \  }}t        | j                        D ]?  \  }}|d d |d d d d d d d d f   }|d d |d d d d f   } ||||| j                        }A |S r}   )	r   r8   r9   r-   r   r   	enumerater^   r   )	r3   r5   rb   rl   rm   r   ri   r   r$   s	            r   r;   zUnivNetLvcBlock.forward  s     00@U@UV}5//<$T^^4 	aKAxQ1aA-.F!Q1*%D$]FD4K_K_`M	a
 r   c                 b   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  j                          | j                  D ]  }|j                           y r=   )	r   r?   r>   r@   rA   r   r   rC   r^   ro   s      r   rC   z!UnivNetLvcBlock.apply_weight_norm  sw    hh**288,,m<((33??KDNN#//1^^ 	&E##%	&r   c                     t         j                  j                  | j                         | j                  j                          | j
                  D ]  }|j                           y r7   )r   r?   rE   r   r   r^   rr   s     r   rE   z"UnivNetLvcBlock.remove_weight_norm  sI    
##DNN3002^^ 	'E$$&	'r   r   rt   rH   s   @r   r   r   [  sW    *  	

 
 	
<U%6%6 UEVEV &'r   r   c                       e Zd ZU eed<   dZdef fdZe	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     de
e   deeej                     ef   fd	       Zd
 Zd Zd Z xZS )UnivNetModelr   input_featuresc                 ^   t         |   |       t        |j                        | _        |j
                  | _        t        j                  |j                  |j                  dddd      | _
        t        |j                        }d}g }|j                  D ]  }||z  }|j                  |        t        j                  t        |      D cg c]  }t        ||||          c}      | _        t        j                  |j                  dddd      | _        | j%                          y c c}w )N   r   r   reflect)r*   r   r#   padding_mode)r   r   )r#   r   )r%   r&   r   r   num_kernelsr-   r   r0   r'   rO   conv_prer   appendr\   r]   r   r^   	conv_post	post_init)r3   r   
num_layers
hop_lengthhop_lengthsr   r   r4   s          r   r&   zUnivNetModel.__init__  s    v;;< & 7 7		$$(("
 556

22 	+F#f,Jz*	+  z*   !,Q	
 6#?#?Aq_hi 	s   D*noise_sequencepadding_mask	generatorreturn_dictreturnc                    ||n| j                   j                  }|j                         dk(  }|s|j                  d      }|j                  \  }}}	|'|j                         dk(  }
|
sX|j                  d      }nF||| j                   j
                  f}t        j                  |||j                  |j                        }|j                  d   }|dkD  r|dk(  r|j                  |dd      }n|dkD  r|dk(  r|j                  |dd      }||k7  rt        d| d| d      |J|j                         dk(  r|j                  d      }|j                  d   }||k7  rt        d	| d| d      |j                  d
d      }|j                  d
d      }| j                  |      }| j                  D ]  } |||      } t        j                   j#                  || j$                        }| j'                  |      }t        j(                  |      }|j+                  d      }d}|t        j,                  |d      }|s||f}|S t/        ||      S )a  
        noise_sequence (`torch.FloatTensor`, *optional*):
            Tensor containing a noise sequence of standard Gaussian noise. Can be batched and of shape `(batch_size,
            sequence_length, config.model_in_channels)`, or un-batched and of shape (sequence_length,
            config.model_in_channels)`. If not supplied, will be randomly generated.
        padding_mask (`torch.BoolTensor`, *optional*):
            Mask indicating which parts of each sequence are padded. Mask values are selected in `[0, 1]`:

            - 1 for tokens that are **not masked**
            - 0 for tokens that are **masked**

            The mask can be batched and of shape `(batch_size, sequence_length)` or un-batched and of shape
            `(sequence_length,)`.
        generator (`torch.Generator`, *optional*):
            A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
            deterministic.
            return_dict:
            Whether to return a [`~utils.ModelOutput`] subclass instead of a plain tuple.

        Example:

         ```python
         >>> from transformers import UnivNetFeatureExtractor, UnivNetModel
         >>> from datasets import load_dataset, Audio

         >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev")
         >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")

         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> # Resample the audio to the feature extractor's sampling rate.
         >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
         >>> inputs = feature_extractor(
         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
         ... )
         >>> audio = model(**inputs).waveforms
         >>> list(audio.shape)
         [1, 140288]
         ```
        Nr   r   )r   dtypedevicer   z&The batch size of `noise_sequence` is z+ and the batch size of `input_features` is z', but the two are expected to be equal.z$The batch size of `padding_mask` is r!   )dim)r   r   )r   use_return_dictr   r   rd   r'   r   randnr   r   repeatr   r   r   r^   r   r8   r9   r-   r   r   squeezesumr   )r3   r   r   r   r   r   spectrogram_batchedspectrogram_batch_sizespectrogram_lengthra   noise_sequence_batchednoise_sequence_shapenoise_sequence_batch_sizepadding_mask_batch_sizer5   ri   waveformr   outputss                      r   r;   zUnivNetModel.forward  s   ` &1%<k$++B]B] -002a7"+55a8N8F8L8L5 2A%%3%7%7%9Q%>")!/!9!9!!< %;<NPTP[P[PmPm#n "[[$	AUAU^l^s^sN %3$8$8$;!!A%*Cq*H+223I1aPN&*/E/J+223LaQRSN$(>>89R8S T((>'??fh 
 #!Q&+55a8&2&8&8&;#&*@@ :;R:S T,,B+CCjl  '00A6'11!Q7m4 	DH$]NCM	D 00@U@UV}5

=1 !((+  #$yy1=!12GN!-
 	
r   c                 P   t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyy)zInitialize the weights.g        )meanstdN)
isinstancer   Linearr0   r   weightdatanormal_r   initializer_ranger$   zero_)r3   modules     r   _init_weightszUnivNetModel._init_weightsR  ss    fryy"))R5G5GHIMM&&CT[[5R5R&S{{&  &&( ' Jr   c                 R   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  D ]  }|j                            || j                         y r=   )	r   r?   r>   r@   rA   r   r^   rC   r   ro   s      r   rC   zUnivNetModel.apply_weight_normY  sq    hh**288,,m<((33??KDMM"^^ 	&E##%	&DNN#r   c                     t         j                  j                  | j                         | j                  D ]  }|j                           t         j                  j                  | j
                         y r7   )r   r?   rE   r   r^   r   rr   s     r   rE   zUnivNetModel.remove_weight_normc  sM    
##DMM2^^ 	'E$$&	'
##DNN3r   )NNNN)r   r   r   r   r   main_input_namer&   r
   r   r   r   	Generatorboolr   tupler   r;   r   rC   rE   rG   rH   s   @r   r   r     s    &O%} %N  7;48/3&*x
))x
 !!2!23x
 u001	x

 EOO,x
 d^x
 
uU&&');;	<x
 x
t)$4r   r   )r   dataclassesr   typingr   r   r   torch.utils.checkpointr   modeling_outputsr   modeling_utilsr	   r?   r
   r   configuration_univnetr   
get_loggerr   loggerr   Moduler   rJ   rw   r   r   __all__r   r   r   <module>r      s    " ! "    + - , 0 
		H	% 	9 	9 	9.0")) .0bu4RYY u4p|/bii |/~M'bii M'` {4? {4 {4| 
r   