
    rh                     z   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+  e(       rddl,m-Z-  e)j\                  e/      Z0e e'd       G d de                    Z1 G d dejd                        Z3 G d dejd                        Z4 G d de      Z5 G d de      Z6 G d d e      Z7 G d! d"ejd                        Z8 G d# d$ejd                        Z9	 	 	 dSd%ejd                  d&e
jt                  d'e
jt                  d(e
jt                  d)ee
jt                     d*ee;   d+e;d,ee
jt                     fd-Z< G d. d/ejd                        Z= G d0 d1ejd                        Z> G d2 d3e      Z? G d4 d5ejd                        Z@ G d6 d7ejd                        ZA G d8 d9e      ZB G d: d;ejd                        ZC G d< d=ejd                        ZDe' G d> d?e#             ZE	 	 dTd@eFeGeGf   dAe;dBeGd)ee
j                     dCeGdDe	j                  fdEZJe ZKe' G dF dGeE             ZL e'dH       G dI dJeE             ZMdKZN e'dL       G dM dNeE             ZO e'dO       G dP dQeE             ZPg dRZQy)U    N)	dataclass)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )UniSpeechConfig)make_flex_block_causal_maskzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r!   r   torchFloatTensor__annotations__r"   r#   r$   r%   tupler&        /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.pyr    r    :   s     )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju00129r0   r    c                   $     e Zd Z fdZd Z xZS )UniSpeechSamePadLayerc                 P    t         |           |dz  dk(  rd| _        y d| _        y )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     r1   r7   zUniSpeechSamePadLayer.__init__X   s)    #:Q#>!#Car0   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r8   r9   r%   s     r1   forwardzUniSpeechSamePadLayer.forward\   s6    ")!Q0F43F3F2F0F*FGMr0   r'   r(   r)   r7   r?   __classcell__r;   s   @r1   r3   r3   W   s    Kr0   r3   c                   $     e Zd Z fdZd Z xZS ) UniSpeechPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr5   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r6   r7   nnConv1dhidden_sizer:   num_conv_pos_embedding_groupsconvutilsrI   hasattrrN   r
   	deepspeedzeroGatheredParametersrK   	original0	original1weight_gweight_vregister_external_parameterr3   rG   r	   feat_extract_activation
activation)r9   configrI   rV   r[   r\   r;   s         r1   r7   z)UniSpeechPositionalConvEmbedding.__init__c   s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI,V-K-KL !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S )Nr   r5   )	transposerS   rG   r_   r>   s     r1   r?   z(UniSpeechPositionalConvEmbedding.forward   sV    %//15		-0]36%//15r0   r@   rB   s   @r1   rD   rD   b   s    ABr0   rD   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   rF   stridebias)r6   r7   conv_dimin_conv_dimout_conv_dimrO   rP   conv_kernelconv_stride	conv_biasrS   r	   r^   r_   r9   r`   layer_idr;   s      r1   r7   z&UniSpeechNoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r0   c                 J    | j                  |      }| j                  |      }|S N)rS   r_   r>   s     r1   r?   z%UniSpeechNoLayerNormConvLayer.forward   s$    		-06r0   r   r@   rB   s   @r1   rd   rd      s    Ar0   rd   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rf   T)elementwise_affine)r6   r7   ri   rj   rk   rO   rP   rl   rm   rn   rS   	LayerNorm
layer_normr	   r^   r_   ro   s      r1   r7   z$UniSpeechLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r0   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )N)rS   rb   ry   r_   r>   s     r1   r?   z#UniSpeechLayerNormConvLayer.forward   sV    		-0%//B76%//B76r0   rs   r@   rB   s   @r1   ru   ru      s    Ar0   ru   c                   &     e Zd Zd fd	Zd Z xZS )UniSpeechGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rf   T)
num_groupsnum_channelsaffine)r6   r7   ri   rj   rk   rO   rP   rl   rm   rn   rS   r	   r^   r_   	GroupNormry   ro   s      r1   r7   z$UniSpeechGroupNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr0   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rr   )rS   ry   r_   r>   s     r1   r?   z#UniSpeechGroupNormConvLayer.forward   s2    		-066r0   rs   r@   rB   s   @r1   r~   r~      s    r r0   r~   c                   .     e Zd ZdZ fdZd Zd Z xZS )UniSpeechFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )rp   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r6   r7   feat_extract_normr~   rangenum_feat_extract_layersrd   ru   
ValueErrorrO   
ModuleListconv_layersgradient_checkpointing_requires_grad)r9   r`   ir   r;   s       r1   r7   z UniSpeechFeatureEncoder.__init__   s    ##w.6vJKv==ABO .fq1uEO K %%0INvOmOmInDE+FQ?K  01I1I0JJst  ==5&+#"O
s   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y NF)
parametersrequires_gradr   r9   params     r1   _freeze_parametersz*UniSpeechFeatureEncoder._freeze_parameters   s(    __& 	(E"'E	(#r0   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S )NT)r   trainingr   r   )r9   input_valuesr%   
conv_layers       r1   r?   zUniSpeechFeatureEncoder.forward   sP    $QW- 4==*.M'** 	6J&}5M	6 r0   )r'   r(   r)   r*   r7   r   r?   rA   rB   s   @r1   r   r      s    8#($

r0   r   c                   $     e Zd Z fdZd Z xZS )UniSpeechFeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Nr|   eps)r6   r7   rO   rx   ri   layer_norm_epsry   LinearrQ   
projectionDropoutfeat_proj_dropoutdropoutr9   r`   r;   s     r1   r7   z#UniSpeechFeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r0   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS rr   )ry   r   r   )r9   r%   norm_hidden_statess      r1   r?   z"UniSpeechFeatureProjection.forward  s:    !__];(:;]3000r0   r@   rB   s   @r1   r   r      s    <1r0   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }	||	|z   }	t        j
                  j                  |	d      }	||	|j                  dddd      z  }	t        j
                  j                  |	|| j                        }	t        j                  |	|      }
|
j                  dd      j                         }
|
|	fS )Nr|         r5   r   rM   r   )pr   )sizer+   matmulrb   rO   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r1   eager_attention_forwardr     s     **R.D(<<s}}Q':;gEL!#n4==((2(>L#innQAq&AA==((6??([L,,|U3K''1-88:K$$r0   c                   H    e Zd ZdZ	 	 	 	 	 ddededededededee   f fd	Z		 	 	 	 dd
e
j                  dee
j                     dee
j                     dee
j                     dee   dee   dee
j                  ee
j                     eee
j                        f   fdZ xZS )UniSpeechAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderrh   	is_causalr`   c                 
   t         |           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rh   )r6   r7   r   r   r   head_dimr`   r   r   r   r   rO   r   k_projv_projq_projout_proj)	r9   r   r   r   r   rh   r   r`   r;   s	           r1   r7   zUniSpeechAttention.__init__0  s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr0   r%   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j                   dd \  }}	|r|j                   d   n|	}
||	d| j                  f}||
d| j                  f} | j                  |      j                  | j	                  dd      }|r|n|} | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }t        }| j                  j                  dk7  rt        | j                  j                     } || ||||f| j                  sdn| j                  | j                  ||d|\  }}|j                  ||	d      j                         }| j!                  |      }||dfS )z#Input shape: Batch x Time x ChannelNr|   r   r5   eager        )r   r   r   r   )shaper   r   r   rb   r   r   r   r`   _attn_implementationr   r   r   r   reshaper   r   )r9   r%   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r1   r?   zUniSpeechAttention.forwardO  s    .T9 %**3B/W/A"((+wgr4==9wDMM: 7t{{=166FPPQRTUV-?)]5T[[055~FPPQRTUV
7t{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#w;FFHmmK0L$..r0   )r   FTFN)NNNF)r'   r(   r)   r*   intfloatboolr   r   r7   r+   Tensorr   r   r.   r?   rA   rB   s   @r1   r   r   -  s   G  ,0CC C 	C
 C C C )CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/r0   r   c                   $     e Zd Z fdZd Z xZS )UniSpeechFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y rr   )r6   r7   rO   r   activation_dropoutintermediate_dropoutr   rQ   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r1   r7   zUniSpeechFeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r0   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S rr   )r   r   r   r   r   r>   s     r1   r?   zUniSpeechFeedForward.forward  sX    //>00?11-@))-8++M:r0   r@   rB   s   @r1   r   r     s    @r0   r   c                   &     e Zd Z fdZddZ xZS )UniSpeechEncoderLayerc                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        y )NFr   r   r   r   r`   r   )r6   r7   r   rQ   num_attention_headsattention_dropout	attentionrO   r   r   r   rx   r   ry   r   feed_forwardfinal_layer_normr   s     r1   r7   zUniSpeechEncoderLayer.__init__  s    +((00,,
 zz&"7"78,,v'9'9v?T?TU08 "V-?-?VEZEZ [r0   c                     |}| j                  |||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S Nr   r   )r   r   ry   r   r   r9   r%   r   r   attn_residualr   _outputss           r1   r?   zUniSpeechEncoderLayer.forward  s    %)-.L] *8 *
&|Q ]3%56%(9(9-(HH--m< "&Gr0   r   r@   rB   s   @r1   r   r     s    \r0   r   c                        e Zd Z fdZ	 	 	 	 ddej
                  deej                     dededef
dZ	de
ej                  df   d	ej                  fd
Z xZS )UniSpeechEncoderc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        y c c}w Nr   F)r6   r7   r`   rD   pos_conv_embedrO   rx   rQ   r   ry   r   r   r   r   r   num_hidden_layersr   layersr   r9   r`   r  r;   s      r1   r7   zUniSpeechEncoder.__init__  s    >vF,,v'9'9v?T?TUzz&"7"78mmERXRjRjLk$lq%:6%B$lm&+# %m   !CNr%   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  ||      }| j	                  |      }	||	z   }| j                  |      }| j                  |      }t               xs t        |       }
| j                  D ]j  }|r||fz   }t        j                  g       }| j                  xr || j                  j                  k  }|r|
r ||||      }|d   }|rd}|sb|d   fz   }l |r||fz   }|st        d |||fD              S t!        |||	      S )
Nr/   r|   r   r5   r   r   NNc              3   &   K   | ]	  }||  y wrr   r/   .0vs     r1   	<genexpr>z+UniSpeechEncoder.forward.<locals>.<genexpr>        mq_`_lm   last_hidden_stater%   r&   )	unsqueezerepeatr   _update_full_maskr	  ry   r   r
   r   r  r+   randr   r`   	layerdropr.   r   r9   r%   r   r   r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  r1   r?   zUniSpeechEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001//

 #11-@%(;;6]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ %!.Te! !.a 0 , &9]1=M<O&O#'	P*   1]4D Dm]4EGZ$[mmm++*
 	
r0   inputs_embedsc                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S Nflash_attention_2r   sdpaflex_attentionF)r   	r`   r   r   dtyper   r+   r   r   r   r9   r   r)  s      r1   r  z"UniSpeechEncoder._update_full_mask      
 %{{//3FF343F  MQ  11V; "E^UbUhUh!i  115EEnell;%@[`%aN
  "<NML_L_!`r0   NFFT)r'   r(   r)   r7   r+   tensorr   r   r   r?   r   r  rA   rB   s   @r1   r  r    s    , 26"'%* :
||:
 !.:
  	:

 #:
 :
xellD01 ||r0   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )UniSpeechAttnAdapterLayerc                    t         |           |j                  | _        |j                  | _        t        j                  | j
                        | _        t        j                  | j
                  | j                        | _
        t        j                         | _        t        j                  | j                  | j
                        | _        y)z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r6   r7   adapter_attn_dim	input_dimrQ   
hidden_dimrO   rx   normr   linear_1ReLUact_fnlinear_2r   s     r1   r7   z"UniSpeechAttnAdapterLayer.__init__  s    
 	00 ,,LL1			$//4>>Bggi		$..$//Br0   r%   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rr   )r;  r<  r>  r?  r>   s     r1   r?   z!UniSpeechAttnAdapterLayer.forward-  s@    		-0m4M2m4r0   )r'   r(   r)   r7   r+   r,   r?   rA   rB   s   @r1   r6  r6    s    CU%6%6 r0   r6  c                   f     e Zd Z fdZ	 	 ddej
                  deej
                     defdZ xZ	S )$UniSpeechEncoderLayerStableLayerNormc                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        t#        |dd       t%        |      | _        y d | _        y )NFr   r   r8  )r6   r7   r   rQ   r   r   r   rO   r   r   r   rx   r   ry   r   r   r   getattrr6  adapter_layerr   s     r1   r7   z-UniSpeechEncoderLayerStableLayerNorm.__init__8  s    +((00,,
 zz&"7"78,,v'9'9v?T?TU08 "V-?-?VEZEZ [6-t4@!:6!BD!%Dr0   r%   r   r   c                 $   |}| j                  |      }| j                  |||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }| j
                  || j                  |      z   }|f}|r||fz  }|S r   )ry   r   r   r   r   rE  r  s           r1   r?   z,UniSpeechEncoderLayerStableLayerNorm.forwardK  s     &6)-.L] *8 *
&|Q ]3%5%(9(9$:O:OP]:^(__))D,>,>},MMM "&Gr0   r   )
r'   r(   r)   r7   r+   r   r   r   r?   rA   rB   s   @r1   rB  rB  7  s>    &, 26"'	|| !.  	r0   rB  c                   p     e Zd Z fdZ	 	 	 	 ddZdeej                  df   dej                  fdZ xZ	S )UniSpeechEncoderStableLayerNormc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        y c c}w r  )r6   r7   r`   rD   r	  rO   rx   rQ   r   ry   r   r   r   r   r   r
  rB  r  r   r  s      r1   r7   z(UniSpeechEncoderStableLayerNorm.__init__f  s    >vF,,v'9'9v?T?TUzz&"7"78mmCHIaIaCbca1&9c
 ',# dr  Nc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  ||      }| j	                  |      }	||	z   }| j                  |      }t               xs t        |       }
| j                  D ]j  }|r||fz   }t        j                  g       }| j                  xr || j                  j                  k  }|r|
r ||||      }|d   }|rd}|sb|d   fz   }l | j                  |      }|r||fz   }|st        d |||fD              S t!        |||	      S )
Nr/   r|   r   r5   r   r   r  c              3   &   K   | ]	  }||  y wrr   r/   r  s     r1   r  z:UniSpeechEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  r  r  )r  r  r   r  r	  r   r
   r   r  r+   r  r   r`   r  ry   r.   r   r   s                  r1   r?   z'UniSpeechEncoderStableLayerNorm.forwardq  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001//

 #11-@%(;;]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ !&!.Te! !.a 0 , &9]1=M<O&O#)	P, 6 1]4D Dm]4EGZ$[mmm++*
 	
r0   r   r)  c                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S r+  r/  r1  s      r1   r  z1UniSpeechEncoderStableLayerNorm._update_full_mask  r2  r0   r3  )
r'   r(   r)   r7   r?   r   r+   r   r  rA   rB   s   @r1   rH  rH  e  sE    	, "<
|ellD01 ||r0   rH  c                   8     e Zd ZdZ fdZed        Zd Z xZS )UniSpeechGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                 0   t         |           |j                  | _        |j                  | _        |j                  | j                  z  dk7  r&t        d|j                   d| j                   d      t        j                  t        j                  d| j                  | j
                  z  |j                  | j                  z              | _        t        j                  |j                  d   | j                  | j
                  z        | _        d| _        y )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr   r|   r5   )r6   r7   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimr   rO   	Parameterr+   r,   codevectorsr   ri   weight_projtemperaturer   s     r1   r7   z'UniSpeechGumbelVectorQuantizer.__init__  s     6688  4??2a7)&*?*?)@ A559__4EEWY  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r0   c           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   r   gHz>r|   )meanr+   expsumlog)probsmarginal_probs
perplexitys      r1   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexity  sR    *YY		.599^VZEZ;[*[ac ddeiik
r0   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )Nr|   T)tauhardr   r         ?r{   )r   rV  r   r   r   rO   r   gumbel_softmaxr   rW  type_asr+   r   r`  argmax	new_zerosscatter_r  rU  rR  r[  )r9   r%   
batch_sizesequence_lengthrQ   codevector_probscodevector_soft_distr_  codevector_idxcodevectors_per_grouprU  s              r1   r?   z&UniSpeechGumbelVectorQuantizer.forward  s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r0   )	r'   r(   r)   r*   r7   staticmethodr`  r?   rA   rB   s   @r1   rN  rN    s&    
(  
#'r0   rN  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej                  ef   fdZded	ej                  fd
Zy)UniSpeechPreTrainedModelr`   	unispeechr   Tc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weightsr   r   )rY  stdr   r5   )abNrd  )r   rN  rV  rK   datanormal_rh   zero_rO   inituniform_rU  rD   rS   mathsqrtrF   in_channels	constant_r   r   in_featuresr   r`   initializer_rangerx   r   fill_rP   kaiming_normal_rH   )r9   r   ks      r1   _init_weightsz&UniSpeechPreTrainedModel._init_weights  s    f<=%%**222C##((..0GGV//0 @AGGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 :;		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r0   input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r+   div)input_lengthrF   rg   s      r1   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length<  s"     99\K7wWZ[[[r0   )zipr`   rl   rm   )r9   r  r  rF   rg   s        r1    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths7  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r0   feature_vector_lengthr   c                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr|   r   r   )r0  devicer   )r  )cumsumr  tor+   longr   zerosr0  r  arangeflipr   )r9   r  r   non_padded_lengthsoutput_lengthsrj  s         r1   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_maskF  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr0   N)r'   r(   r)   r   r-   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr  r   r+   
LongTensorr   r  r  r/   r0   r1   rr  rr    sg    #$O&*#N9BeEDTDTVYDY>Z  ]b]m]m r0   rr  r   	mask_probmask_length	min_masksr   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r  num_masked_spanepsilonr  r  r  rk  s     r1   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span|  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr0   Nr|   r0  r   F)replace)r   nprandomr  itemdetachr[  tolistr   r  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r  r  r   r  rj  r  r  r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rk  s    `` `            @@r1   _compute_mask_indicesr  V  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   *    e Zd Zdef fdZ	 	 ddej                  deej                     deej                     fdZ	e
	 	 	 	 	 ddeej                     deej                     deej                     dee   d	ee   d
ee   deeef   fd       Z xZS )UniSpeechModelr`   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nr   )r6   r7   r`   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probrO   rT  r+   r   rQ   r|  masked_spec_embeddo_stable_layer_normrH  encoderr  	post_initr   s     r1   r7   zUniSpeechModel.__init__  s     !8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r0   r%   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r  r  r   r  )r  r0  )r  r  r  r|   )rD  r`   r   r  r  r0  r  r   r  mask_time_lengthmask_time_min_masksr+   r4  r  r   r  mask_feature_lengthmask_feature_min_masksexpand)r9   r%   r  r   rj  rk  rQ   mask_feature_indicess           r1   _mask_hidden_statesz"UniSpeechModel._mask_hidden_states  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r0   r   r   r  r  r   c                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }|s
||f|	dd z   S t        |||	j                  |	j                        S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r5   )r  r   r   r   r  r  r   )r  extract_featuresr%   r&   )r`   r   r  use_return_dictr  rb   r  r   r  r  r  UniSpeechBaseModelOutputr%   r&   )
r9   r   r   r  r   r  r  r  r%   encoder_outputss
             r1   r?   zUniSpeechModel.forward  s@    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*!#34qr7JJJ'+-)77&11	
 	
r0   r  NNNNN)r'   r(   r)   r   r7   r+   r,   r   r  r  r   r   r   r   r.   r  r?   rA   rB   s   @r1   r  r    s     ( :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*2
u||,2
 !.2
 $E$5$56	2

 $D>2
 'tn2
 d^2
 
u..	/2
 2
r0   r  zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                   "    e Zd Zdef fdZdefdZd Zd Ze		 dde
j                  de
j                  d	e
j                  defd
       Ze	 	 	 	 ddee
j                     dee
j                     dee   dee   dee   deeef   fd       Z xZS )UniSpeechForPreTrainingr`   c                 .   t         |   |       t        |      | _        t	        j
                  |j                        | _        t        |      | _	        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                         | _        t	        j
                  |j$                        | _        | j)                          y rr   )r6   r7   r  rs  rO   r   feat_quantizer_dropoutdropout_featuresrN  	quantizerr   rS  proj_codevector_dim	project_qrQ   project_hidnum_ctc_classesctc_projfinal_dropoutr   r  r   s     r1   r7   z UniSpeechForPreTraining.__init__M  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r0   rW  c                 &    || j                   _        y)zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r  rW  )r9   rW  s     r1   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature\  s     &1"r0   c                 X    t        j                  dt               | j                          yz
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr9   s    r1   freeze_feature_extractorz0UniSpeechForPreTraining.freeze_feature_extractorb  '    
 	Q	

 	##%r0   c                 L    | j                   j                  j                          y
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nrs  r  r   r  s    r1   r  z.UniSpeechForPreTraining.freeze_feature_encodern      
 	((;;=r0   target_featuresnegative_featurespredicted_featuresc                     t        j                  | |gd      } t        j                  |j                         | j                         d      }|j	                  |       }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r   r|   )r+   catcosine_similarityr   rf  )r  r   r  rW  logitss        r1   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logitsu  sa      ))_6G$HaP(();)A)A)C_EZEZE\bde0 +%r0   r   r   r   r  r  r   c                    ||n| j                   j                  }| j                  |||||      }|d   }| j                  |d         }| j	                  |      \  }	}
| j                  |	j                  | j
                  j                  j                              }	| j                  |	      }	t        j                  |j                  d      |j                  d            j                  | j                   j                        }|j                  dd      }t        j                   |      j#                         j                  |j$                        }|j                  dd      }|j'                  d      }|j)                  |d      |	j)                  | d      z   }| j+                  |      }| j-                  |      }d}|s||||	|
f|dd z   S ||	|
f|dd z   S t/        |||	|
|j0                  |j2                        S )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr  r   r   r|   r   r5   )r!   r"   r#   r$   r%   r&   )r`   r  rs  r  r  r  r  rK   r0  r  r+   emptyr   r  replace_probrb   	bernoullir   r  r  masked_fillr   r  r    r%   r&   )r9   r   r   r   r  r  r  transformer_featuresr  quantized_featuresr$   prob_replace_matrixsampled_replace_matrixr  r!   s                  r1   r?   zUniSpeechForPreTraining.forward  s   * &1%<k$++B]B]..)/!5# ! 
  'qz  00<48NNCS4T11 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 24FH]^ahijikalll(*<>STW^_`_aWbbb,1'9"7!//))
 	
r0   )r   )NNNN)r'   r(   r)   r   r7   r   r  r  r  rp  r+   r,   r  r   r   r   r   r   r.   r    r?   rA   rB   s   @r1   r  r  G  s    1# 1
&> 
 	** ,, "-- 	 &  26,0/3&*D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 
u33	4D
 D
r0   r  r5   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                        e Zd Zddee   f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd       Z xZS )UniSpeechForCTCtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a3  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r6   r7   r  rs  rO   r   r  r   r  
vocab_sizer   r;   rU   r  output_hidden_sizerQ   r   lm_headr  )r9   r`   r  r  r;   s       r1   r7   zUniSpeechForCTC.__init__  s     	 '/zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r0   c                     | j                   }|&t        | j                  dd      t        d| d      |-t        | j                  dd      t        j                  d       y|| j                  |d       yy)a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr8  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  rD  r`   r   loggerinfoload_adapter)r9   r  s     r1   tie_weightszUniSpeechForCTC.tie_weights  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r0   c                 X    t        j                  dt               | j                          y)r  r  Nr  r  s    r1   r  z(UniSpeechForCTC.freeze_feature_extractor  r  r0   c                 L    | j                   j                  j                          yr  r  r  s    r1   r  z&UniSpeechForCTC.freeze_feature_encoder  r  r0   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrs  r   r   r   s     r1   freeze_base_modelz!UniSpeechForCTC.freeze_base_model  (    
 ^^..0 	(E"'E	(r0   r   r   r   r  r  labelsr   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }| j                  |      }| j                  |      }	d}
|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |	dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }
ddd       |s|	f|t6        d z   }|
|
f|z   S |S t9        |
|	|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r  r|   )rM   r0  r   F)enabled)blank	reductionzero_infinityr!   r  r%   r&   )r`   r  r  r  r   rs  r   r  r+   	ones_liker  r  r[  r  masked_selectrO   r   log_softmaxfloat32rb   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r%   r&   )r9   r   r   r   r  r  r%  r  r%   r  r!   r  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r1   r?   zUniSpeechForCTC.forward'  s'   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]..)/!5# ! 
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIrr   r  )r'   r(   r)   r   r   r7   r  r  r  r#  r   r+   r   r   r   r.   r   r?   rA   rB   s   @r1   r  r    s    HSM :<*
&>(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r0   r  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee	j                     deeef   fd       Z xZS )"UniSpeechForSequenceClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nr  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r6   r7   rU   r  r   r  rs  r
  use_weighted_layer_sumrO   rT  r+   r  layer_weightsr   rQ   classifier_proj_size	projector
num_labels
classifierr  )r9   r`   
num_layersr;   s      r1   r7   z+UniSpeechForSequenceClassification.__init__v  s     6=)f.@.@r  (/--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r0   c                 X    t        j                  dt               | j                          yr  r  r  s    r1   r  z;UniSpeechForSequenceClassification.freeze_feature_extractor  r  r0   c                 L    | j                   j                  j                          yr  r  r  s    r1   r  z9UniSpeechForSequenceClassification.freeze_feature_encoder  r  r0   c                 P    | j                   j                         D ]	  }d|_         yr!  r"  r   s     r1   r#  z4UniSpeechForSequenceClassification.freeze_base_model  r$  r0   r   r   r   r  r  r%  r   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }||j                  d      }
n| j                  |j                   d   |      }|j#                  d      j%                  dd|j                   d         }d	|| <   |j                  d      |j                  d      j                  dd      z  }
| j'                  |
      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   r|   r   r5   r   r+  )r`   r  r@  rs  r7  r+   stackrO   r   r   rA  r   r[  rC  rY  r  r   r  r  rE  r   rD  r   r%   r&   )r9   r   r   r   r  r  r%  r  r%   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r!   loss_fctr<  s                    r1   r?   z*UniSpeechForSequenceClassification.forward  s   . &1%<k$++B]B]'+{{'I'ItOc..)/!5# ! 
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r0   r  )r'   r(   r)   r7   r  r  r#  r   r   r+   r   r   r   r.   r   r?   rA   rB   s   @r1   r>  r>  o  s    "
&>(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
r0   r>  )r  r  r>  r  rr  )Nr   Nr=   )Rr}  r  dataclassesr   typingr   r   r   numpyr  r+   torch.nnrO   r   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   rT   r   r   r   configuration_unispeechr   integrations.flex_attentionr   
get_loggerr'   r  r    Moduler3   rD   rd   ru   r~   r   r   r   r   r   r   r   r   r  r6  rB  rH  rN  rr  r.   r   r  ndarrayr  r  r  r  r7  r  r>  __all__r/   r0   r1   <module>rd     sG  ,   ! , ,    % ! @ 7 g B 9  G & J J 4  !J 
		H	% 
:K : :.BII *ryy *Z$> *"< 6"< 0&bii &R1 1*  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<U/ U/p299 0!6 !HZryy Zz		 2++E +\^bii ^BC'RYY C'L F F FZ 26tc?tt t U--.	t
 t ZZtn 3  s
- s
 s
l 
B
6 B

B
J !"  
S
. S

S
l p
)A p
p
fr0   