
"""PyTorch SpeechT5 model."""

import math
from typing import Optional, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import EmbeddingAccessMixin, PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig


logger = logging.get_logger(__name__)

_HIDDEN_STATES_START_POSITION = 1

def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
def shift_spectrograms_right(
    input_values: torch.Tensor, reduction_factor: int = 1, attention_mask: Optional[torch.Tensor] = None
):
    """
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    """
    # thin out frames for reduction factor
    if reduction_factor > 1:
        input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
        if attention_mask is not None:
            attention_mask = attention_mask[:, reduction_factor - 1 :: reduction_factor]

    shifted_input_values = input_values.new_zeros(input_values.shape)
    shifted_input_values[:, 1:] = input_values[:, :-1].clone()

    # replace possible -100 values in labels by zeros
    shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)

    return shifted_input_values, attention_mask


def _compute_mask_indices(
    shape: tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked_span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad the
        # vector to ensure the same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length`, in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask
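
# Illustrative sketch (not part of the original module): with `mask_prob=0.5`
# and `mask_length=2` over a length-10 axis, roughly 0.5 * 10 / 2 = 2 spans of
# 2 consecutive positions are sampled per batch row (fewer after overlaps).
#
#   >>> mask = _compute_mask_indices(shape=(3, 10), mask_prob=0.5, mask_length=2)
#   >>> mask.shape, mask.dtype
#   ((3, 10), dtype('bool'))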
class SpeechT5NoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5LayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)

        hidden_states = hidden_states.transpose(-2, -1)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states.transpose(-2, -1)

        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5GroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the existing buffer
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create the position ids from the input token ids. Any padded tokens remain padded.
        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
            input_ids.device
        )

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()

    def create_position_ids_from_input_ids(
        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
    ):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        """
        # The series of casts and type-conversions here are carefully balanced to work with ONNX export and XLA.
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx
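
# Illustrative sketch (not part of the original module): the table concatenates
# all sines before all cosines (the tensor2tensor layout), so row 0 is all
# zeros followed by all ones for an even `embedding_dim`.
#
#   >>> table = SpeechT5SinusoidalPositionalEmbedding.get_embedding(4, 6, padding_idx=None)
#   >>> table.shape
#   torch.Size([4, 6])
#   >>> table[0]
#   tensor([0., 0., 0., 1., 1., 1.])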
class SpeechT5PositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


class SpeechT5ScaledPositionalEncoding(nn.Module):
    """
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
    """

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super().__init__()
        self.register_buffer("pe", pe, persistent=False)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def forward(self, emb):
        emb = emb + self.alpha * self.pe[:, : emb.size(1)]
        emb = self.dropout(emb)
        return emb
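
# Design note with a minimal sketch (not part of the original module): unlike a
# fixed sinusoidal encoding, the contribution of `pe` is scaled by a learnable
# scalar `alpha` (initialized to 1.0), so training can tune how strongly
# position information is mixed into the prenet output. Sizes are made up.
#
#   >>> enc = SpeechT5ScaledPositionalEncoding(dropout=0.1, dim=8, max_len=100)
#   >>> out = enc(torch.zeros(2, 10, 8))  # adds alpha * pe[:, :10]
#   >>> out.shape
#   torch.Size([2, 10, 8])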
class SpeechT5RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, dim, max_length=1000):
        super().__init__()
        self.dim = dim
        self.max_length = max_length
        self.pe_k = torch.nn.Embedding(2 * max_length, dim)

    def forward(self, hidden_states):
        seq_len = hidden_states.shape[1]
        pos_seq = torch.arange(0, seq_len).to(device=hidden_states.device, dtype=torch.long)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]

        pos_seq[pos_seq < -self.max_length] = -self.max_length
        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
        pos_seq = pos_seq + self.max_length

        return self.pe_k(pos_seq)


class SpeechT5SamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SpeechT5FeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            hidden_states = conv_layer(hidden_states)

        return hidden_states


class SpeechT5FeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class SpeechT5SpeechEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.feature_encoder = SpeechT5FeatureEncoder(config)
        self.feature_projection = SpeechT5FeatureProjection(config)

        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
            config.max_speech_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def freeze_feature_encoder(self):
        self.feature_encoder._freeze_parameters()

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
    ):
        extract_features = self.feature_encoder(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1],
                attention_mask,
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        positional_conv_embedding = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + positional_conv_embedding

        if attention_mask is not None:
            padding_mask = attention_mask.ne(1).long()
        else:
            padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)

        positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
        hidden_states = hidden_states + positional_sinusoidal_embeddings

        return hidden_states, attention_mask

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not in-place to be able to run on inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask
    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states
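
# Worked example (not part of the original module), assuming the default
# SpeechT5 feature-encoder geometry (kernels 10,3,3,3,3,2,2 / strides
# 5,2,2,2,2,2,2): applying floor((L - kernel) / stride) + 1 per layer maps one
# second of 16 kHz audio to 49 feature frames (a hop of 320 samples, i.e. 20 ms).
#
#   >>> lengths = torch.tensor([16000])  # 1 second at 16 kHz
#   >>> for k, s in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
#   ...     lengths = torch.div(lengths - k, s, rounding_mode="floor") + 1
#   >>> lengths
#   tensor([49])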
class SpeechT5SpeechDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                nn.Linear(
                    config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
                    config.speech_decoder_prenet_units,
                )
                for i in range(config.speech_decoder_prenet_layers)
            ]
        )

        self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_speech_positions,
        )
        self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)

    def _consistent_dropout(self, inputs_embeds, p):
        mask = torch.bernoulli(inputs_embeds[0], p=p)
        all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)

    def forward(
        self,
        input_values: torch.Tensor,
        speaker_embeddings: Optional[torch.Tensor] = None,
    ):
        # Dropout is always applied, even when evaluating. See §2.2 in https://huggingface.co/papers/1712.05884.

        inputs_embeds = input_values
        for layer in self.layers:
            inputs_embeds = nn.functional.relu(layer(inputs_embeds))
            inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)

        inputs_embeds = self.final_layer(inputs_embeds)
        inputs_embeds = self.encode_positions(inputs_embeds)

        if speaker_embeddings is not None:
            speaker_embeddings = nn.functional.normalize(speaker_embeddings)
            speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
            inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
            inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))

        return inputs_embeds


class SpeechT5BatchNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()

        if layer_id == 0:
            in_conv_dim = config.num_mel_bins
        else:
            in_conv_dim = config.speech_decoder_postnet_units

        if layer_id == config.speech_decoder_postnet_layers - 1:
            out_conv_dim = config.num_mel_bins
        else:
            out_conv_dim = config.speech_decoder_postnet_units

        self.conv = nn.Conv1d(
            in_conv_dim,
            out_conv_dim,
            kernel_size=config.speech_decoder_postnet_kernel,
            stride=1,
            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(out_conv_dim)

        if layer_id < config.speech_decoder_postnet_layers - 1:
            self.activation = nn.Tanh()
        else:
            self.activation = None

        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        if self.activation is not None:
            hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SpeechT5SpeechDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
        self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)

        self.layers = nn.ModuleList(
            [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
        )

    def forward(self, hidden_states: torch.Tensor):
        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
        outputs_after_postnet = self.postnet(outputs_before_postnet)
        logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
        return outputs_before_postnet, outputs_after_postnet, logits

    def postnet(self, hidden_states: torch.Tensor):
        layer_output = hidden_states.transpose(1, 2)
        for layer in self.layers:
            layer_output = layer(layer_output)
        return hidden_states + layer_output.transpose(1, 2)


class SpeechT5TextEncoderPrenet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_text_positions,
        )

    def forward(self, input_ids: torch.Tensor):
        inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = self.encode_positions(inputs_embeds)
        return inputs_embeds


class SpeechT5TextDecoderPrenet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.positional_dropout)
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
            config.max_text_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            raise ValueError("You have to specify `decoder_input_ids`")

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        positions = self.embed_positions(input_ids, past_key_values_length)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        inputs_embeds += positions
        inputs_embeds = self.dropout(inputs_embeds)

        return inputs_embeds, attention_mask


class SpeechT5TextDecoderPostnet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        return self.lm_head(hidden_states)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
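
# Design note with a minimal sketch (not part of the original module): the
# speech decoder prenet applies dropout even at inference time (see §2.2 of
# https://huggingface.co/papers/1712.05884, referenced above), and
# `_consistent_dropout` samples one Bernoulli mask from the first batch item
# and repeats it over the batch, so every item sees the same dropped units.
#
#   >>> x = torch.ones(4, 10, 8)             # (batch, frames, prenet units)
#   >>> mask = torch.bernoulli(x[0], p=0.5)  # one mask, shared across the batch
#   >>> mask.shape
#   torch.Size([10, 8])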
class SpeechT5Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross_attentions
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # relative attention bias
        if position_bias is not None:
            reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1)
            rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1))
            rel_pos_bias = rel_pos_bias.transpose(0, 1).view(
                bsz * self.num_heads, position_bias.size(0), position_bias.size(1)
            )
            attn_weights += rel_pos_bias

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient. In order to do so, attn_weights have to be reshaped twice and
            # have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class SpeechT5FeedForward(nn.Module):
    def __init__(self, config, intermediate_size):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states
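
# Illustrative sketch (not part of the original module): the attention block is
# shape-preserving; with `output_attentions=True` it also returns the per-head
# attention weights. Sizes below are made up.
#
#   >>> attn = SpeechT5Attention(embed_dim=768, num_heads=12)
#   >>> out, weights = attn(torch.randn(2, 50, 768), output_attentions=True)
#   >>> out.shape, weights.shape
#   (torch.Size([2, 50, 768]), torch.Size([2, 12, 50, 50]))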
class SpeechT5EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.attention = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
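
# Worked example (not part of the original module): the `position_bias` fed to
# the encoder layers comes from `SpeechT5RelativePositionalEncoding` above,
# which clamps relative offsets to [-max_length, max_length) before shifting
# them to non-negative embedding indices. For seq_len=3 and max_length=2:
#
#   >>> pos = torch.arange(3)
#   >>> rel = pos[:, None] - pos[None, :]  # [[0,-1,-2],[1,0,-1],[2,1,0]]
#   >>> rel.clamp(-2, 1) + 2               # equivalent index matrix for `pe_k`
#   tensor([[2, 1, 0],
#           [3, 2, 1],
#           [3, 3, 2]])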
class SpeechT5DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SpeechT5Config, layer_idx=None):
        super().__init__()
        self.self_attn = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = SpeechT5Attention(
            config.hidden_size,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs
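
# Illustrative sketch (not part of the original module): during generation the
# decoder layer is driven with an `EncoderDecoderCache`, so self-attention
# keys/values grow step by step while cross-attention keys/values are computed
# once from the encoder output and then reused via the `is_updated` flag.
#
#   >>> from transformers.cache_utils import DynamicCache, EncoderDecoderCache
#   >>> cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
#   >>> # pass `past_key_value=cache` on each step with tgt_len == 1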
@auto_docstring
class SpeechT5PreTrainedModel(PreTrainedModel):
    config: SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        std = self.config.initializer_range
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5ScaledPositionalEncoding):
            module.alpha.data.fill_(1.0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if hasattr(module, "masked_spec_embed"):
            nn.init.uniform_(module.masked_spec_embed)
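
# Worked example (not part of the original module): for the positional conv
# embedding with the SpeechT5 defaults (num_conv_pos_embeddings=128,
# hidden_size=768), the dedicated branch above draws weights from N(0, std)
# with
#
#   std = 2 * sqrt(1 / (128 * 768)) ≈ 0.0064
#
# while plain `nn.Linear`/`nn.Embedding` modules fall through to the generic
# normal(0, initializer_range) branch.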
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self-Attention Block
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected Block
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class SpeechT5PreTrainedModel(PreTrainedModel):
    config: SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        std = self.config.initializer_range
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5ScaledPositionalEncoding):
            module.alpha.data.fill_(1.0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if hasattr(module, "masked_spec_embed"):
            nn.init.uniform_(module.masked_spec_embed)


class SpeechT5Encoder(SpeechT5PreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layerdrop = config.encoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5EncoderLayer(config) for _ in range(config.encoder_layers)])

        self.embed_positions = SpeechT5RelativePositionalEncoding(
            config.hidden_size // config.encoder_attention_heads, config.encoder_max_relative_position
        )

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            # expand attention_mask to [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)
        position_bias = self.embed_positions(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop

            if not skip_the_layer or synced_gpus:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_bias=position_bias,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states, attention_mask = self.prenet(input_values, attention_mask)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = self.prenet(input_values)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        return self.wrapped_encoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SpeechT5Decoder(SpeechT5PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layerdrop = config.decoder_layerdrop

        self.layers = nn.ModuleList(
            [SpeechT5DecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)]
        )

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = hidden_states.size()[:-1]

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            )

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer and not synced_gpus:
                continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                ),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        return outputs


class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        return outputs


class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        outputs = self.wrapped_decoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs


class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
    """
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.sigma = config.guided_attention_loss_sigma
        self.scale = config.guided_attention_loss_scale

    def forward(
        self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
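
        A minimal illustrative sketch (the shapes and `config` fields below are assumptions for the example, not
        requirements of a particular checkpoint):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5Config
        >>> from transformers.models.speecht5.modeling_speecht5 import SpeechT5GuidedMultiheadAttentionLoss

        >>> config = SpeechT5Config(guided_attention_loss_sigma=0.4, guided_attention_loss_scale=10.0)
        >>> criterion = SpeechT5GuidedMultiheadAttentionLoss(config)
        >>> # batch of 2, 4 attention heads, 6 output steps attending over 5 input steps
        >>> attentions = torch.softmax(torch.randn(2, 4, 6, 5), dim=-1)
        >>> input_masks = torch.ones(2, 5, dtype=torch.bool)
        >>> output_masks = torch.ones(2, 6, dtype=torch.bool)
        >>> loss = criterion(attentions, input_masks, output_masks)  # scalar tensor, penalizes off-diagonal attention
        ```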
        """
        guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
        masks = output_masks.unsqueeze(-1) * input_masks.unsqueeze(-2)
        masks = masks.to(attentions.device).unsqueeze(1)

        losses = guided_attn_masks * attentions
        loss = torch.mean(losses.masked_select(masks))
        return self.scale * loss

    def _make_guided_attention_masks(self, input_masks, output_masks, device):
        input_lengths = input_masks.sum(-1)
        output_lengths = output_masks.sum(-1)

        guided_attn_masks = torch.zeros(
            (len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device
        )

        for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)

        return guided_attn_masks.unsqueeze(1)

    @staticmethod
    def _make_guided_attention_mask(input_length, output_length, sigma, device):
        grid_y, grid_x = torch.meshgrid(
            torch.arange(input_length, device=device),
            torch.arange(output_length, device=device),
            indexing="xy",
        )
        grid_x = grid_x.float() / output_length
        grid_y = grid_y.float() / input_length
        return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2)))


class SpeechT5SpectrogramLoss(nn.Module):
    """
    Loss computation used by SpeechT5ForTextToSpeech.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.use_guided_attention_loss = config.use_guided_attention_loss
        self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
        self.reduction_factor = config.reduction_factor

        self.l1_criterion = L1Loss()
        self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))
        if self.use_guided_attention_loss:
            self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        outputs_before_postnet: torch.FloatTensor,
        outputs_after_postnet: torch.FloatTensor,
        logits: torch.FloatTensor,
        labels: torch.FloatTensor,
        cross_attentions: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        padding_mask = labels != -100.0

        # mask out the padded portions
        labels = labels.masked_select(padding_mask)
        outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
        outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)

        # spectrogram loss
        l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)

        # construct stop labels from the padding mask
        masks = padding_mask[:, :, 0]
        stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
        stop_labels = stop_labels[:, 1:].masked_select(masks)
        logits = logits.masked_select(masks)

        # stop token loss
        bce_loss = self.bce_criterion(logits, stop_labels)

        # combined loss
        loss = l1_loss + bce_loss

        # guided attention loss
        if self.use_guided_attention_loss:
            attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
            input_masks = attention_mask == 1
            output_masks = padding_mask[:, :, 0]
            if self.reduction_factor > 1:
                output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
            attn_loss = self.attn_criterion(attn, input_masks, output_masks)
            loss += attn_loss

        return loss


@auto_docstring(
    custom_intro="""
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    """
)
class SpeechT5Model(SpeechT5PreTrainedModel):
    def __init__(
        self,
        config: SpeechT5Config,
        encoder: Optional[nn.Module] = None,
        decoder: Optional[nn.Module] = None,
    ):
        """
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        """
        super().__init__(config)
        self.config = config
        self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
        self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            return self.encoder.get_input_embeddings()
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            return self.decoder.get_input_embeddings()
        raise NotImplementedError

    def set_input_embeddings(self, value):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            self.encoder.set_input_embeddings(value)
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            self.decoder.set_input_embeddings(value)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            self.encoder.prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
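
        A bare-model sketch (illustrative only; the checkpoint name is an assumption -- any SpeechT5 checkpoint
        works, and without prenets the inputs are already hidden states):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5Model

        >>> model = SpeechT5Model.from_pretrained("microsoft/speecht5_asr")
        >>> encoder_hidden = torch.randn(1, 50, model.config.hidden_size)
        >>> decoder_hidden = torch.randn(1, 10, model.config.hidden_size)
        >>> out = model(input_values=encoder_hidden, decoder_input_values=decoder_hidden)
        >>> out.last_hidden_state.shape  # (batch, decoder steps, hidden_size)
        torch.Size([1, 10, 768])
        ```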
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values=input_values,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # downsample encoder attention mask (only for encoders with speech input)
        if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask(
                encoder_outputs[0].shape[1], attention_mask
            )
        else:
            encoder_attention_mask = attention_mask

        if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet):
            decoder_args = {"speaker_embeddings": speaker_embeddings}
        else:
            decoder_args = {}

        decoder_outputs = self.decoder(
            input_values=decoder_input_values,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **decoder_args,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a speech encoder and a text decoder.
    """
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        text_decoder = SpeechT5DecoderWithTextPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)

        self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    def get_output_embeddings(self):
        return self.text_decoder_postnet.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_decoder_postnet.set_output_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqLMOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```
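
        If your audio is not already sampled at 16 kHz (the rate this checkpoint expects), you can resample it
        first with `datasets` -- a small illustrative sketch, reusing the `dataset` from above:

        ```python
        >>> from datasets import Audio

        >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))  # resample on the fly
        ```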

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
        )

        logits = self.text_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


def _generate_speech(
    model: SpeechT5PreTrainedModel,
    input_values: torch.FloatTensor,
    speaker_embeddings: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 20.0,
    vocoder: Optional[nn.Module] = None,
    output_cross_attentions: bool = False,
    return_output_lengths: bool = False,
) -> Union[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor]]:
    if speaker_embeddings is None:
        raise ValueError(
            """`speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    """
        )

    if attention_mask is None:
        encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int()
    else:
        encoder_attention_mask = attention_mask

    bsz = input_values.size(0)

    encoder_out = model.speecht5.encoder(
        input_values=input_values,
        attention_mask=encoder_attention_mask,
        return_dict=True,
    )

    encoder_last_hidden_state = encoder_out.last_hidden_state

    # downsample encoder attention mask
    if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet):
        encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask(
            encoder_out[0].shape[1], encoder_attention_mask
        )

    maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor)
    minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor)

    # Start the output sequence with a mel spectrum that is all zeros.
    output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins)

    spectrogram = []
    cross_attentions = []
    past_key_values = None
    idx = 0
    result_spectrogram = {}

    while True:
        idx += 1

        # Run the decoder prenet on the entire output sequence.
        decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
        # Run the decoder layers on the last element of the prenet output.
        decoder_out = model.speecht5.decoder.wrapped_decoder(
            hidden_states=decoder_hidden_states[:, -1:],
            attention_mask=None,
            encoder_hidden_states=encoder_last_hidden_state,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
            output_attentions=output_cross_attentions,
            return_dict=True,
        )

        if output_cross_attentions:
            cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0))

        last_decoder_output = decoder_out.last_hidden_state.squeeze(1)
        past_key_values = decoder_out.past_key_values

        # Predict the new mel spectrum for this step in the sequence.
        spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
        spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins)
        spectrogram.append(spectrum)

        # Extend the output sequence with the new mel spectrum.
        new_spectrogram = spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)
        output_sequence = torch.cat((output_sequence, new_spectrogram), dim=1)
        # Predict the probability that this is the stop token.
        prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))

        if idx < minlen:
            continue
        else:
            # If the generation loop is less than maximum length time, check the ones in the batch that have met
            # the prob threshold. Otherwise, assume all have met thresholds and fill other spectrograms for the batch.
            if idx < maxlen:
                meet_thresholds = torch.sum(prob, dim=-1) >= threshold
                meet_indexes = torch.where(meet_thresholds)[0].tolist()
            else:
                meet_indexes = range(len(prob))
            meet_indexes = [i for i in meet_indexes if i not in result_spectrogram]
            if len(meet_indexes) > 0:
                spectrograms = torch.stack(spectrogram)
                spectrograms = spectrograms.transpose(0, 1).flatten(1, 2)
                spectrograms = model.speech_decoder_postnet.postnet(spectrograms)
                for meet_index in meet_indexes:
                    result_spectrogram[meet_index] = spectrograms[meet_index]
            if len(result_spectrogram) >= bsz:
                break

    spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))]
    if not return_output_lengths:
        spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        if vocoder is not None:
            outputs = vocoder(spectrogram)
        else:
            outputs = spectrogram
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            if bsz > 1:
                cross_attentions = cross_attentions.view(
                    bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
                )
            outputs = (outputs, cross_attentions)
    else:
        spectrogram_lengths = []
        for i in range(bsz):
            spectrogram_lengths.append(spectrograms[i].size(0))
        spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        if vocoder is None:
            outputs = (spectrograms, spectrogram_lengths)
        else:
            waveforms = vocoder(spectrograms)
            waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths]
            outputs = (waveforms, waveform_lengths)
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            cross_attentions = cross_attentions.view(
                bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
            )
            outputs = (*outputs, cross_attentions)
    return outputs


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a text encoder and a speech decoder.
    """
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        text_encoder = SpeechT5EncoderWithTextPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def can_generate(cls) -> bool:
        # SpeechT5 has a unique model structure, where the external class (`SpeechT5ForTextToSpeech`) doesn't need
        # to inherit from `GenerationMixin` (it has a non-standard generation method). This means the base
        # `can_generate()` would return `False`, so we override it to enable `GenerationConfig` handling.
        return True

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )
            if self.config.use_guided_attention_loss:
                output_attentions = True

        outputs = self.speecht5(
            input_values=input_ids,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        outputs_before_postnet, outputs_after_postnet, logits = self.speech_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            criterion = SpeechT5SpectrogramLoss(self.config)
            loss = criterion(
                attention_mask,
                outputs_before_postnet,
                outputs_after_postnet,
                logits,
                labels,
                outputs.cross_attentions,
            )

        if not return_dict:
            output = (outputs_after_postnet,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=outputs_after_postnet,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
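
    # NOTE (editor's sketch): `speech_decoder_postnet` yields the raw decoder
    # spectrogram, the postnet-refined spectrogram, and per-frame stop logits.
    # When `labels` are provided, the spectrogram criterion above scores both
    # spectrogram variants against the targets and the stop logits against the
    # implied stop positions; see `SpeechT5SpectrogramLoss` earlier in this file.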
r,   r  r  r  r  r  r  c
                     |W|j                  d      }|j                  d      |k7  r2|j                  d      dk(  r|j                  |d      }nt        d      t        | |||||||||	
      S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   rm  r'   r  )ry   r   r/   rr  r  r  r  r  r  r  kwargsr[   s               r*   generatez SpeechT5ForTextToSpeech.generate5
  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   c
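
    # NOTE (editor's sketch): decoding stops once the predicted stop-token
    # probability exceeds `threshold`, but `_generate_speech` also clamps the
    # number of decoder steps to roughly `minlenratio`/`maxlenratio` times the
    # encoder sequence length (scaled by `config.reduction_factor`).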
                     |W|j                  d      }
|j                  d      |
k7  r2|j                  d      dk(  r|j                  |
d      }nt        d      t        | |||||||||	
      S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r  )ry   r   rr  r/   r  r  r  r  r  r  r[   s              r*   generate_speechz'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   NNNNNNNNNNNNNNNNNNNg      ?r2   g      4@NFF)r   r   r   r!  r   rm   classmethodrO   r  r  r  r   r   r   r\  r]  r   r  r   r   r   r   r   r   r#  r!  r#  r   r   s   @r*   r  r  	  s    "O~ ( T  ++  1559<@=A159=7;EIEI$(,0/3&*:>.2.215%D
E,,-D
 !!1!12D
 'u'8'89	D

 !))9)9 :D
 E--.D
 $E$5$56D
 'u||4D
 "%e.?.?(@"ABD
 "%e.?.?(@"ABD
 D>D
 $D>D
 'tnD
 d^D
 %U%6%67D
  **+!D
" ell+#D
$ !.%D
& 
u..	/'D
 D
L U]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v U]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r,   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    """
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
        r"""
|||d|      }| j                  |d         \  }}}d}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        NTr  r   r   r  )rz   r;  r4   r.   r  r  r   r  r  r  rn  r  r  r  )ry   r-   r/   r  r  r-  r  rk  r  r  r  r  r.  r/  rr  r  r  r  r  r\   r  r  r  r  s                           r*   r   z!SpeechT5ForSpeechToSpeech.forward  s-   h &1%<k$++B]B]#+?WDKK88:P@<$&< --%)!5#9/!5++1/!5)   
$ "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
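
    # NOTE (editor's sketch): unlike `SpeechT5ForTextToSpeech`, this forward
    # pass computes no spectrogram loss; it returns the post-net spectrogram
    # and leaves `loss` set to `None` even when `labels` are supplied.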
    @torch.no_grad()
    def generate_speech(
        self,
        input_values: torch.FloatTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
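
        Example:

        A minimal sketch (assuming the `processor`, `model`, `vocoder`, `inputs` and `speaker_embeddings`
        objects built in the `forward` example above):

        ```python
        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```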
        """
        if speaker_embeddings is None:
            speaker_embeddings = torch.zeros((1, 512), device=input_values.device)

        return _generate_speech(
            self,
            input_values,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )


class HifiGanResidualBlock(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
        super().__init__()
        self.leaky_relu_slope = leaky_relu_slope

        self.convs1 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=dilation[i],
                    padding=self.get_padding(kernel_size, dilation[i]),
                )
                for i in range(len(dilation))
            ]
        )
        self.convs2 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=1,
                    padding=self.get_padding(kernel_size, 1),
                )
                for _ in range(len(dilation))
            ]
        )

    def get_padding(self, kernel_size, dilation=1):
        return (kernel_size * dilation - dilation) // 2

    def apply_weight_norm(self):
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        for layer in self.convs1:
            weight_norm(layer)
        for layer in self.convs2:
            weight_norm(layer)

    def remove_weight_norm(self):
        for layer in self.convs1:
            nn.utils.remove_weight_norm(layer)
        for layer in self.convs2:
            nn.utils.remove_weight_norm(layer)

    def forward(self, hidden_states):
        for conv1, conv2 in zip(self.convs1, self.convs2):
            residual = hidden_states
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            hidden_states = conv1(hidden_states)
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            hidden_states = conv2(hidden_states)
            hidden_states = hidden_states + residual
        return hidden_states


@auto_docstring(
    custom_intro="""
    HiFi-GAN vocoder.
    """
)
class SpeechT5HifiGan(PreTrainedModel):
    config: SpeechT5HifiGanConfig
    main_input_name = "spectrogram"

    def __init__(self, config: SpeechT5HifiGanConfig):
        super().__init__(config)
        self.num_kernels = len(config.resblock_kernel_sizes)
        self.num_upsamples = len(config.upsample_rates)
        self.conv_pre = nn.Conv1d(config.model_in_dim, config.upsample_initial_channel, kernel_size=7, stride=1, padding=3)

        self.upsampler = nn.ModuleList()
        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
            self.upsampler.append(
                nn.ConvTranspose1d(
                    config.upsample_initial_channel // (2**i),
                    config.upsample_initial_channel // (2 ** (i + 1)),
                    kernel_size=kernel_size,
                    stride=upsample_rate,
                    padding=(kernel_size - upsample_rate) // 2,
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.upsampler)):
            channels = config.upsample_initial_channel // (2 ** (i + 1))
            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))

        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)

        self.register_buffer("mean", torch.zeros(config.model_in_dim))
        self.register_buffer("scale", torch.ones(config.model_in_dim))

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def apply_weight_norm(self):
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        weight_norm(self.conv_pre)
        for layer in self.upsampler:
            weight_norm(layer)
        for layer in self.resblocks:
            layer.apply_weight_norm()
        weight_norm(self.conv_post)

    def remove_weight_norm(self):
        nn.utils.remove_weight_norm(self.conv_pre)
        for layer in self.upsampler:
            nn.utils.remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()
        nn.utils.remove_weight_norm(self.conv_post)

    @auto_docstring(
        custom_intro="""
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        """
    )
    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
        r"""
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
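
        Example:

        A minimal sketch (assuming the public `microsoft/speecht5_hifigan` checkpoint; a random
        spectrogram stands in for real SpeechT5 output, so the resulting audio is noise):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5HifiGan

        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        >>> spectrogram = torch.randn(140, vocoder.config.model_in_dim)  # un-batched log-mel input
        >>> waveform = vocoder(spectrogram)
        >>> waveform.dim()  # un-batched input yields a 1-d waveform
        1
        ```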
        """
        if self.config.normalize_before:
            spectrogram = (spectrogram - self.mean) / self.scale

        is_batched = spectrogram.dim() == 3
        if not is_batched:
            spectrogram = spectrogram.unsqueeze(0)

        hidden_states = spectrogram.transpose(2, 1)

        hidden_states = self.conv_pre(hidden_states)
        for i in range(self.num_upsamples):
            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
            hidden_states = self.upsampler[i](hidden_states)

            res_state = self.resblocks[i * self.num_kernels](hidden_states)
            for j in range(1, self.num_kernels):
                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
            hidden_states = res_state / self.num_kernels

        hidden_states = nn.functional.leaky_relu(hidden_states)
        hidden_states = self.conv_post(hidden_states)
        hidden_states = torch.tanh(hidden_states)

        if not is_batched:
            # remove batch dim and collapse tensor to 1-d audio waveform
            waveform = hidden_states.squeeze(0).transpose(1, 0).flatten()
        else:
            # remove seq-len dim since this dimension is 1
            waveform = hidden_states.squeeze(1)

        return waveform


__all__ = [
    "SpeechT5ForSpeechToSpeech",
    "SpeechT5ForSpeechToText",
    "SpeechT5ForTextToSpeech",
    "SpeechT5HifiGan",
    "SpeechT5Model",
    "SpeechT5PreTrainedModel",
]