
    rh-G                     *   d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZ ddlmZ  ej>                  e       Z!e ed       G d de                    Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z(e G d de             Z)eZ* G d d e)e      Z+ ed!       G d" d#e)             Z, G d$ d%e      Z- G d& d'e      Z.g d(Z/y))zPyTorch UniSpeech model.    N)	dataclass)OptionalUnion   )ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   tupler         /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r   -   s     )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju00129r*   r   c                       e Zd Zy) UniSpeechPositionalConvEmbeddingNr!   r"   r#   r)   r*   r+   r-   r-   J       r*   r-   c                       e Zd Zy)UniSpeechFeatureEncoderNr.   r)   r*   r+   r1   r1   N   r/   r*   r1   c                       e Zd Zy)UniSpeechFeatureProjectionNr.   r)   r*   r+   r3   r3   R   r/   r*   r3   c                       e Zd Zy)UniSpeechEncoderNr.   r)   r*   r+   r5   r5   V   r/   r*   r5   c                       e Zd Zy)UniSpeechEncoderStableLayerNormNr.   r)   r*   r+   r7   r7   Z   r/   r*   r7   c                   "    e Zd Zed        Zd Zy)UniSpeechGumbelVectorQuantizerc           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   dimgHz>)meanr%   expsumlog)probsmarginal_probs
perplexitys      r+   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexity_   sR    *YY		.599^VZEZ;[*[ac ddeiik
r*   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )Nr=   T)tauhardr;   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr%   softmaxrE   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsr@   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrD   codevector_idxcodevectors_per_groupr[   s              r+   forwardz&UniSpeechGumbelVectorQuantizer.forwarde   s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r*   N)r!   r"   r#   staticmethodrE   re   r)   r*   r+   r9   r9   ^   s     
#'r*   r9   c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej                  ef   fdZded	ej                  fd
Zy)UniSpeechPreTrainedModelconfig	unispeechinput_valuesTc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weights        r   )r>   stdr   r   )abNrI   )
isinstancer9   rL   weightdatanormal_biaszero_rP   inituniform_r[   r-   convmathsqrtkernel_sizein_channels	constant_r3   
projectionin_featuresLinearri   initializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)r]   moduleks      r+   _init_weightsz&UniSpeechPreTrainedModel._init_weights   s    f<=%%**222C##((..0GGV//0 @AGGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 :;		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r*   input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r%   div)input_lengthr|   strides      r+   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s"     99\K7wWZ[[[r*   )zipri   conv_kernelconv_stride)r]   r   r   r|   r   s        r+    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths   sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r*   feature_vector_lengthattention_maskc                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr=   r;   r   )dtypedevicer   )r   )cumsumr   tor%   longrK   zerosr   r   arangeflipbool)r]   r   r   non_padded_lengthsoutput_lengthsr^   s         r+   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_mask   s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr*   N)r!   r"   r#   r   r'   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r%   
LongTensorintr   r   r)   r*   r+   rh   rh      sg    #$O&*#N9BeEDTDTVYDY>Z  ]b]m]m r*   rh   c                       e Zd ZdefdZd Zd Z	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
ee   dee   deeef   fdZy)UniSpeechModelri   c                    t         j                  |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nrm   )rh   __init__ri   r1   feature_extractorr3   feature_projectionmask_time_probmask_feature_probrP   	Parameterr%   Tensorr`   rx   masked_spec_embeddo_stable_layer_normr7   encoderr5   	post_init)r]   ri   s     r+   r   zUniSpeechModel.__init__   s     ))&1!8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r*   c                     t        d      NzNot needed for UniSpeechAttributeErrorr]   s    r+   freeze_feature_extractorz'UniSpeechModel.freeze_feature_extractor       788r*   c                     t        d      r   r   r   s    r+   freeze_feature_encoderz%UniSpeechModel.freeze_feature_encoder   r   r*   Nrk   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }|s
||f|	dd z   S t        |||	j                  |	j                        S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r    )ri   r   r   use_return_dictr   	transposer   rK   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r    )
r]   rk   r   r   r   r   r   r   r   encoder_outputss
             r+   re   zUniSpeechModel.forward   s@    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*!#34qr7JJJ'+-)77&11	
 	
r*   )NNNNN)r!   r"   r#   r   r   r   r   r   r%   r   r&   r   r   r(   r   re   r)   r*   r+   r   r      s     "99 269=,0/3&*2
u||,2
 !.2
 $E$5$56	2

 $D>2
 'tn2
 d^2
 
u..	/2
r*   r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                   "    e Zd Zdef fdZdefdZd Zd Ze		 dde
j                  de
j                  d	e
j                  defd
       Ze	 	 	 	 ddee
j                     dee
j                     dee   dee   dee   deeef   fd       Z xZS )UniSpeechForPreTrainingri   c                 .   t         |   |       t        |      | _        t	        j
                  |j                        | _        t        |      | _	        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                         | _        t	        j
                  |j$                        | _        | j)                          y )N)superr   r   rj   rP   Dropoutfeat_quantizer_dropoutdropout_featuresr9   	quantizerr   codevector_dimproj_codevector_dim	project_qr`   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   )r]   ri   	__class__s     r+   r   z UniSpeechForPreTraining.__init__+  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r*   rT   c                 &    || j                   _        y)zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rT   )r]   rT   s     r+   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature:  s     &1"r*   c                 X    t        j                  dt               | j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)warningswarnFutureWarningr   r   s    r+   r   z0UniSpeechForPreTraining.freeze_feature_extractor@  s'    
 	Q	

 	##%r*   c                 L    | j                   j                  j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rj   r   _freeze_parametersr   s    r+   r   z.UniSpeechForPreTraining.freeze_feature_encoderL  s    
 	((;;=r*   target_featuresnegative_featurespredicted_featuresc                     t        j                  | |gd      } t        j                  |j                         | j                         d      }|j	                  |       }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r;   r=   )r%   catcosine_similarityrS   rU   )r   r   r   rT   logitss        r+   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logitsS  sa      ))_6G$HaP(();)A)A)C_EZEZE\bde0 +%r*   rk   r   r   r   r   r   c                    ||n| j                   j                  }| j                  |||||      }|d   }| j                  |d         }| j	                  |      \  }	}
| j                  |	j                  | j
                  j                  j                              }	| j                  |	      }	t        j                  |j                  d      |j                  d            j                  | j                   j                        }|j                  dd      }t        j                   |      j#                         j                  |j$                        }|j                  dd      }|j'                  d      }|j)                  |d      |	j)                  | d      z   }| j+                  |      }| j-                  |      }d}|s||||	|
f|dd z   S ||	|
f|dd z   S t/        |||	|
|j0                  |j2                        S )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   r=   rm   r   )r   r   r   r   r   r    )ri   r   rj   r   r   r   r   rr   r   r   r%   emptysizer   replace_probr   	bernoullir   r   rZ   masked_fillr   r   r   r   r    )r]   rk   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr   r   s                  r+   re   zUniSpeechForPreTraining.forwardg  s   * &1%<k$++B]B]..)/!5# ! 
  'qz  00<48NNCS4T11 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 24FH]^ahijikalll(*<>STW^_`_aWbbb,1'9"7!//))
 	
r*   )r   )NNNN)r!   r"   r#   r   r   r   r   r   r   rf   r%   r&   r   r
   r   r   r   r   r(   r   re   __classcell__)r   s   @r+   r   r   %  s    1# 1
&> 
 	** ,, "-- 	 &  26,0/3&*D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 
u33	4D
 D
r*   r   c                       e Zd Zy)UniSpeechForCTCNr.   r)   r*   r+   r  r    r/   r*   r  c                       e Zd Zy)"UniSpeechForSequenceClassificationNr.   r)   r*   r+   r  r    r/   r*   r  )r  r   r  r   rh   )0r$   rz   r   dataclassesr   typingr   r   r%   torch.nnrP   modeling_outputsr   r   modeling_utilsr	   utilsr
   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr!   loggerr   r-   r1   r3   r5   r7   r9   rh   r   r   r   r  r  __all__r)   r*   r+   <module>r     sK      ! "   D - ,
 
 
 5 
		H	% 
:K : :.	'F 		4 		!: 		 		&D 	*'%B *'Z F F FR 3 J
-} J
Z 
B
6 B

B
J	n 		)J 	r*   