
    rhl                       d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlm Z m!Z!m"Z" ddl#m$Z$  e"jJ                  e&      Z' G d de	jP                        Z) G d de	jP                        Z* G d de	jP                        Z+ G d de	jP                        Z, G d de	jP                        Z- G d de      Z. G d de      Z/ G d de	jP                        Z0 G d de	jP                        Z1 G d  d!e	jP                        Z2e  G d" d#e             Z3 G d$ d%e      Z4 G d& d'e      Z5 G d( d)e      Z6 G d* d+e	jP                        Z7 G d, d-e	jP                        Z8 G d. d/e	jP                        Z9	 	 dKd0e:e;e;f   d1e<d2e;d3eejz                     d4e;d5ej|                  fd6Z?eZ@e  G d7 d8e3             ZAd9ZB e d:;       G d< d=e3             ZC e d>;       G d? d@e3             ZDe  G dA dBe3             ZE G dC dDe	jP                        ZF G dE dFe	jP                        ZG e dG;       G dH dIe3             ZHg dJZIy)L    N)OptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel)auto_docstringis_peft_availablelogging   )WavLMConfigc                   $     e Zd Z fdZd Z xZS )WavLMSamePadLayerc                 P    t         |           |dz  dk(  rd| _        y d| _        y N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.pyr   zWavLMSamePadLayer.__init__&   s)    #:Q#>!#Ca    c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r   r   hidden_statess     r"   forwardzWavLMSamePadLayer.forward*   s6    ")!Q0F43F3F2F0F*FGMr#   __name__
__module____qualname__r   r(   __classcell__r!   s   @r"   r   r   %   s    Kr#   r   c                   $     e Zd Z fdZd Z xZS )WavLMPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   nnConv1dhidden_sizer    num_conv_pos_embedding_groupsconvutilsr5   hasattrr:   r   	deepspeedzeroGatheredParametersr7   	original0	original1weight_gweight_vregister_external_parameterr   r3   r   feat_extract_activation
activation)r   configr5   rB   rG   rH   r!   s         r"   r   z%WavLMPositionalConvEmbedding.__init__1   s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI()G)GH !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S Nr   r   )	transposer?   r3   rK   r&   s     r"   r(   z$WavLMPositionalConvEmbedding.forwardR   sV    %//15		-0]36%//15r#   r)   r.   s   @r"   r0   r0   0   s    ABr#   r0   c                   $     e Zd Z fdZd Z xZS )WavLMFeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Neps)r   r   r;   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr=   
projectionDropoutfeat_proj_dropoutdropoutr   rL   r!   s     r"   r   zWavLMFeatureProjection.__init__^   sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r#   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS N)rY   r[   r^   )r   r'   norm_hidden_statess      r"   r(   zWavLMFeatureProjection.forwardd   s:    !__];(:;]3000r#   r)   r.   s   @r"   rQ   rQ   ]   s    <1r#   rQ   c                       e Zd ZdZ	 	 	 	 ddedededededef fdZ	 	 	 	 dd	ej                  d
e
ej                     de
ej                     dedeej                  e
ej                     e
eej                        f   f
dZd	ej                  d
eej                  ej                   f   dej                  dedej                  ej                  ff
dZdededej                  fdZdej                  dej                  fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr^   num_bucketsmax_distancehas_relative_position_biasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        || _        || _        t        j                   t#        j$                  d| j                  dd            | _        t        j                  | j
                  d      | _        |r0t        j*                  | j                  | j                        | _        y y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )r   r   re   rf   r^   head_dim
ValueErrorscalingr;   rZ   k_projv_projq_projout_projrg   rh   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)r   re   rf   r^   rg   rh   ri   r!   s          r"   r   zWavLMAttention.__init__o   s7    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*ii	95ii	95ii	95		)Y7&(!#ejjDNNAq.Q!R"$))DMM1"=%"$,,t/?/?"PD &r#   r'   attention_maskposition_biasoutput_attentionsreturnc                     |j                         \  }}}|S| j                  ||      }|j                  d      j                  |ddd      j	                  || j
                  z  ||      }|j	                  |j                  dd | j
                  dfz         }	|	j                  dddd      }	| j                  |	      }
|
j	                  |	j                  dd dz         j                  d      }
t        j                  |
      j                  dd      \  }}||| j                  z  d	z
  z  d
z   }|j	                  || j
                  z  dd      |z  }|j	                  d||f      }| j                  ||||      \  }}|||fS )z'Attention layer with relative attentionNr   r   rS   r   r   )r      r9         ?g       @)sizecompute_bias	unsqueezerepeatviewrf   shapepermuterw   sumrt   sigmoidchunkrv   torch_multi_head_self_attention)r   r'   rz   r{   r|   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r"   r(   zWavLMAttention.forward   s    (,,.Wa   --gw?M''*11#q!Q?DDS4>>EY[bdkl  ,001D1DSb1IT^^]_L`1`a199!Q1E "&!8!89L!M!7!<!<=P=V=VWZXZ=[^d=d!e!i!ijl!m '=>DDQBDO)?)? ?# EFL *..sT^^/CRKm[166GW7MN$($H$H>+>@Q%
!\ L-77r#   r   c                 X   |j                  dd      x}x}}||j                  d      nd}dx}	}
d}t        j                  |||| j                  | j
                  t        j                  dg      t        j                  | j                  j                  | j                  j                  | j                  j                  f      |	|
|| j                  | j                  j                  | j                  j                  | j                   |||d| j                  j                  | j                  j                  | j                  j                        \  }}|j                  dd      }|C|dddf   j#                  |j$                  dd | j
                  fz   |j$                  dd z         }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)rO   neFmulti_head_attention_forwardre   rf   rt   emptycatrq   biasro   rp   r^   rr   r7   trainingbroadcast_tor   )r   r'   rz   r   r|   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr   r   s                 r"   r   z.WavLMAttention.torch_multi_head_self_attention   s    ,55a;;;e3A3M>,,Q/SW  %&$B$BNNNNKKIIt{{'')9)94;;;K;KLMLLMM  MMMM%)++,,++,,++,,+%
!\2 "++Aq1# (40==""2A&$..)::\=O=OPQPR=SSL L((r#   query_length
key_lengthc                    t        j                  |t         j                        d d d f   }t        j                  |t         j                        d d d f   }||z
  }| j                  |      }|j	                  | j
                  j                  j                        }| j                  |      }|j                  g d      }|S )Ndtype)r   r   r   )	rt   arangelong_relative_positions_buckettory   r7   devicer   )r   r   r   context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r"   r   zWavLMAttention.compute_bias   s     <<EJJG4P,,zDT1WM+.>>#'#B#BCT#U #;#>#>t?R?R?Y?Y?`?`#a $$%=>	*r#   relative_positionsc                 $   | j                   dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }t        j
                  |j                         |z        }|t        j
                  | j                  |z        z  }|||z
  z  }||z   j                  t        j                        }t        j                  |t        j                  ||dz
              }|t        j                  |||      z  }|S r   )rg   r   rt   r   abslogfloatmathrh   min	full_likewhere)r   r   rg   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r"   r   z)WavLMAttention._relative_positions_bucket   s   &&!+.266uzzB[P"YY'9:1$	%	1&+ii0B0H0H0JY0V&W#&ADHHTM^M^ajMjDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\R\%]"%*YY&8RT_bcTc(d&
" 	EKK2DF`aar#   )        i@  i   TNNFr   )r*   r+   r,   __doc__intr   boolr   rt   Tensorr   tupler(   FloatTensorr   
LongTensor
BoolTensorr   r   r   r-   r.   s   @r"   rd   rd   l   s   G +/"Q"Q "Q 	"Q
 "Q "Q %)"QN 2604"''8||'8 !.'8  -	'8
  '8 
u||Xell3XeELL>Q5RR	S'8R5)((5) e..0@0@@A5) #..	5)
  5) 

U..	/5)n # %BSBS  U=N=N  SXSdSd  r#   rd   c                   $     e Zd Z fdZd Z xZS )WavLMFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y ra   )r   r   r;   r\   activation_dropoutintermediate_dropoutrZ   r=   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr_   s     r"   r   zWavLMFeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r#   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S ra   )r   r   r   r   r   r&   s     r"   r(   zWavLMFeedForward.forward   sX    //>00?11-@))-8++M:r#   r)   r.   s   @r"   r   r     s    @r#   r   c                   2     e Zd Zddedef fdZddZ xZS )WavLMEncoderLayerrL   ri   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y N)re   rf   r^   rg   rh   ri   rT   r   r   rd   r=   num_attention_headsattention_dropoutrg   max_bucket_distance	attentionr;   r\   r   r^   rV   rX   rY   r   feed_forwardfinal_layer_normr   rL   ri   r!   s      r"   r   zWavLMEncoderLayer.__init__+      '((00,,**33'A
 zz&"7"78,,v'9'9v?T?TU,V4 "V-?-?VEZEZ [r#   c                     |}| j                  |||||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }||f}|r||fz  }|S )Nrz   r{   r|   r   )r   r^   rY   r   r   )	r   r'   rz   r{   r|   r   attn_residualr   outputss	            r"   r(   zWavLMEncoderLayer.forward:  s    %59^^)'/ 6D 6
2|] ]3%56%(9(9-(HH--m< -0&Gr#   Tr   r*   r+   r,   r   r   r   r(   r-   r.   s   @r"   r   r   *  s    \{ \ \r#   r   c                   2     e Zd Zddedef fdZddZ xZS ) WavLMEncoderLayerStableLayerNormrL   ri   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y r   r   r   s      r"   r   z)WavLMEncoderLayerStableLayerNorm.__init__T  r   r#   c                     |}| j                  |      }| j                  ||||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }||f}|r||fz  }|S )N)rz   r{   r|   )rY   r   r^   r   r   )r   r'   rz   r{   r|   r   r   r   s           r"   r(   z(WavLMEncoderLayerStableLayerNorm.forwardc  s    %659^^)'/	 6D 6
2|] ]3%5%(9(9$:O:OP]:^(__ -0&Gr#   r   )NNFr   r.   s   @r"   r   r   S  s    \{ \ \r#   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w NrT   r   )ri   F)r   r   rL   r0   pos_conv_embedr;   rV   r=   rX   rY   r\   r   r^   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointingr   rL   ir!   s      r"   r   zWavLMEncoder.__init__y  s    :6B,,v'9'9v?T?TUzz&"7"78mmUZ[a[s[sUtuPQv16Ku
 ',# v   !Cc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }| j                  |      }t               xs t        |       }
d }t        | j                        D ]y  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r ||||||      }|d d \  }}|rd}|sq|d   fz   }{ |r||fz   }|st        d |||fD              S t!        |||	      S )
N rS   r   r   r   r   NNNc              3   &   K   | ]	  }||  y wra   r  .0vs     r"   	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>       mq_`_lm   last_hidden_stater'   
attentions)r   r   r   r   rY   r^   r   r	   	enumerater  rt   randr   rL   	layerdropr   r   r   r'   rz   r|   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr{   r  layerdropout_probabilityskip_the_layerlayer_outputss                    r"   r(   zWavLMEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;6]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ %!#1"/&7! 0=Ra/@,} 2 &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r#   NFFTr)   r.   s   @r"   r   r   x  s    	, ";
r#   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w r   )r   r   rL   r0   r   r;   rV   r=   rX   rY   r\   r   r^   r   r   r  r   r  r  r  s      r"   r   z$WavLMEncoderStableLayerNorm.__init__  s    :6B,,v'9'9v?T?TUzz&"7"78mm v778 1UVZ[U[]
 ',#r  c                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }t               xs t        |       }
d }t        | j                        D ]x  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r |||||      }|d d \  }}|rd}|sp|d   fz   }z | j                  |      }|r||fz   }|st        d |||fD              S t!        |||	      S )
Nr  rS   r   r   r   )rz   r|   r{   r	  c              3   &   K   | ]	  }||  y wra   r  r  s     r"   r  z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  r  r  )r   r   r   r   r^   r   r	   r  r  rt   r  r   rL   r  rY   r   r   r  s                    r"   r(   z#WavLMEncoderStableLayerNorm.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ !&!#1&7"/	! 0=Ra/@,} 2 &9]1=M<O&O#/	P2 6 1]4D Dm]4EGZ$[mmm+;LYl
 	
r#   r#  r)   r.   s   @r"   r%  r%    s    ," "9
r#   r%  c                   8     e Zd ZdZ fdZed        Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                 0   t         |           |j                  | _        |j                  | _        |j                  | j                  z  dk7  r&t        d|j                   d| j                   d      t        j                  t        j                  d| j                  | j
                  z  |j                  | j                  z              | _        t        j                  |j                  d   | j                  | j
                  z        | _        d| _        y )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rS   r   )r   r   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimrm   r;   rs   rt   r   codevectorsrZ   rW   weight_projtemperaturer_   s     r"   r   z#WavLMGumbelVectorQuantizer.__init__  s     6688  4??2a7)&*?*?)@ A66:oo5F G%%  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r#   c           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   r   gHz>rS   )meanrt   expr   r   )probsmarginal_probs
perplexitys      r"   _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity(  sR    *YY		.599^VZEZ;[*[ac ddeiik
r#   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      }|j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )NrS   T)tauhardr   r   r   )r   r2  r   r-  r   r;   
functionalgumbel_softmaxr   r3  type_asrt   softmaxr:  argmax	new_zerosscatter_r   r1  r/  r   )r   r'   
batch_sizesequence_lengthr=   codevector_probscodevector_soft_distr9  codevector_idxcodevectors_per_groupr1  s              r"   r(   z"WavLMGumbelVectorQuantizer.forward.  s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;M<O<O<QW[WgWgnr;s/77F $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r#   )	r*   r+   r,   r   r   staticmethodr:  r(   r-   r.   s   @r"   r*  r*    s&    
*  
"'r#   r*  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Z	 ddeej                  ef   d	ee   fd
Z	 ddedej                  fdZy)WavLMPreTrainedModelrL   wavlminput_valuesTFc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weightsr   r   )r5  stdr   r   )abNr   )r   r*  r2  r7   datanormal_r   zero_r;   inituniform_r1  r0   r?   r   sqrtr2   in_channels	constant_rQ   r[   in_featuresrZ   rL   initializer_rangerV   	GroupNormfill_r<   kaiming_normal_r4   )r   moduleks      r"   _init_weightsz"WavLMPreTrainedModel._init_weights]  s    f89%%**222C##((..0GGV//0 <=GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 67		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r#   Ninput_lengthsadd_adapterc                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )rt   divinput_lengthr2   strides      r"   _conv_out_lengthzOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s"     99\K7wWZ[[[r#   r   )rL   rf  zipconv_kernelconv_strider   num_adapter_layersadapter_stride)r   re  rf  ro  r2   rn  r   s          r"    _get_feat_extract_output_lengthsz5WavLMPreTrainedModel._get_feat_extract_output_lengths~  s     2=1Ddkk--+	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q 4;;99: _ 04;;C]C] ^_ r#   feature_vector_lengthrz   c                     |j                  d      d d df   }| j                  ||      }|j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )NrS   r   rf  r   )r   r   r   )r   )cumsumru  r   rt   r   r   zerosr   r   r   flipr   )r   rv  rz   rf  non_padded_lengthsoutput_lengthsrF  s          r"   "_get_feature_vector_attention_maskz7WavLMPreTrainedModel._get_feature_vector_attention_mask  s    
 ,22r2:1b5A>>?Q_j>k'**5::6#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr#   ra   )r*   r+   r,   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrd  r   rt   r   r   r   r   ru  r~  r  r#   r"   rN  rN  S  s    $O&*# N9D Z^"5#3#3S#89HPQU0 Y]%(:?:J:Jr#   rN  c                   &     e Zd Zd fd	Zd Z xZS )WavLMNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   r2   rn  r   )r   r   rW   in_conv_dimout_conv_dimr;   r<   rq  rr  	conv_biasr?   r   rJ   rK   r   rL   layer_idr!   s      r"   r   z"WavLMNoLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r#   c                 J    | j                  |      }| j                  |      }|S ra   )r?   rK   r&   s     r"   r(   z!WavLMNoLayerNormConvLayer.forward  s$    		-06r#   r   r)   r.   s   @r"   r  r    s    Ar#   r  c                   &     e Zd Zd fd	Zd Z xZS )WavLMLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   r  T)elementwise_affine)r   r   rW   r  r  r;   r<   rq  rr  r  r?   rV   rY   r   rJ   rK   r  s      r"   r   z WavLMLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r#   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr>  rS   )r?   rO   rY   rK   r&   s     r"   r(   zWavLMLayerNormConvLayer.forward  sV    		-0%//B76%//B76r#   r  r)   r.   s   @r"   r  r    s    Ar#   r  c                   &     e Zd Zd fd	Zd Z xZS )WavLMGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   r  T)r-  num_channelsaffine)r   r   rW   r  r  r;   r<   rq  rr  r  r?   r   rJ   rK   r_  rY   r  s      r"   r   z WavLMGroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr#   c                 l    | j                  |      }| j                  |      }| j                  |      }|S ra   )r?   rY   rK   r&   s     r"   r(   zWavLMGroupNormConvLayer.forward  s2    		-066r#   r  r)   r.   s   @r"   r  r    s    r r#   r  c                   .     e Zd ZdZ fdZd Zd Z xZS )WavLMFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r  r   r  z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normr  r   num_feat_extract_layersr  r  rm   r;   r   conv_layersr  _requires_grad)r   rL   r  r  r!   s       r"   r   zWavLMFeatureEncoder.__init__  s    ##w.26AFGKPQWQoQorsQsKtKFG)&1q5AK K %%0PUV\VtVtPuv126AFvKv01I1I0JJst  ==5&+#"K ws   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_gradr  r   params     r"   _freeze_parametersz&WavLMFeatureEncoder._freeze_parameters  s(    __& 	(E"'E	(#r#   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S )NT)r  r   r  r  )r   rP  r'   
conv_layers       r"   r(   zWavLMFeatureEncoder.forward  sP    $QW- 4==*.M'** 	6J&}5M	6 r#   )r*   r+   r,   r   r   r  r(   r-   r.   s   @r"   r  r    s    8#"$

r#   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterLayerc                     t         |           t        j                  |j                  d|j                  z  |j
                  |j                  d      | _        y )Nr   r   )rn  r3   )r   r   r;   r<   output_hidden_sizeadapter_kernel_sizert  r?   r_   s     r"   r   zWavLMAdapterLayer.__init__  sJ    II%%)))&&((
	r#   c                 j    | j                  |      }t        j                  j                  |d      }|S )Nr   r   )r?   r;   r?  glur&   s     r"   r(   zWavLMAdapterLayer.forward#  s/    		-0))-Q)?r#   r)   r.   s   @r"   r  r    s    
r#   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterc                    t         |           j                  j                  k7  rTt	        j
                  j                  j                        | _        t	        j                  j                        | _        nd x| _        | _        t	        j                  fdt        j                        D              | _        j                  | _        y )Nc              3   4   K   | ]  }t                y wra   )r  )r  r   rL   s     r"   r  z(WavLMAdapter.__init__.<locals>.<genexpr>5  s     #h!$5f$=#hs   )r   r   r  r=   r;   rZ   projrV   proj_layer_normr   r   rs  r  r  r_   s    `r"   r   zWavLMAdapter.__init__+  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#huVMfMfGg#hh))r#   c                 h   | j                   .| j                  "| j                  |      }| j                  |      }|j                  dd      }| j                  D ]D  }t        j
                  j                         }| j                  r|| j                  kD  s= ||      }F |j                  dd      }|S rN   )r  r  rO   r  nprandomr   r  )r   r'   r  layerdrop_probs       r"   r(   zWavLMAdapter.forward8  s    99 T%9%9%E IIm4M 00?M%//15[[ 	5EYY--/N==^dnn%D %m 4	5
 &//15r#   r)   r.   s   @r"   r  r  *  s    *r#   r  r   	mask_probmask_lengthrz   	min_masksr}   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rm  num_masked_spanepsilonr  r  r  rG  s     r"   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spano  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr#   NrS   r   r   F)replace)rm   r  r  r  itemdetachr   tolistr   rz  r   choicer   lenconcatenateru   int32appendarrayr   reshaper  put_along_axis)r   r  r  rz   r  rF  r  r   re  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrm  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rG  s    `` `            @@r"   _compute_mask_indicesr  I  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   6    e Zd Zdef fdZd Zd Z	 	 ddej                  de	ej                     de	ej                     fdZe	 	 	 	 	 dd	e	ej                     de	ej                     de	ej                     d
e	e   de	e   de	e   deeef   fd       Z xZS )
WavLMModelrL   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        |j(                  rt+        |      nd | _        | j/                          y )Nr   )r   r   rL   r  feature_extractorrQ   feature_projectionmask_time_probmask_feature_probr;   rs   rt   r   r=   rY  masked_spec_embeddo_stable_layer_normr%  encoderr   rf  r  adapter	post_initr_   s     r"   r   zWavLMModel.__init__  s     !4V!<"8"@   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&6v>DL'/DL/5/A/A|F+t 	r#   c                 X    t        j                  dt               | j                          yz
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr   s    r"   freeze_feature_extractorz#WavLMModel.freeze_feature_extractor  '    
 	Q	

 	##%r#   c                 8    | j                   j                          y
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  r  s    r"   r  z!WavLMModel.freeze_feature_encoder  s    
 	113r#   r'   mask_time_indicesrz   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r  r  rz   r  )r   r   )r  r  r  rS   )getattrrL   r   r  r   r   r  r   r  mask_time_lengthmask_time_min_masksrt   tensorr   r   r  mask_feature_lengthmask_feature_min_masksexpand)r   r'   r  rz   rF  rG  r=   mask_feature_indicess           r"   _mask_hidden_stateszWavLMModel._mask_hidden_states  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r#   rP  r|   r  r  r}   c                 H   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|!| j                  |j                  d   |d      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }| j                  | j                  |      }|s
||f|	dd z   S t        |||	j                  |	j                  	      S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   Frx  )r  rz   rz   r|   r  r  r   )r  extract_featuresr'   r  )rL   r|   r  use_return_dictr  rO   r~  r   r  r  r  r  WavLMBaseModelOutputr'   r  )
r   rP  rz   r  r|   r  r  r  r'   encoder_outputss
             r"   r(   zWavLMModel.forward  sb    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!#34qr7JJJ#+-)77&11	
 	
r#   )NNNNNNN)r*   r+   r,   r   r   r  r  rt   r   r   r   r  r   r   r   r   r   r  r(   r-   r.   s   @r"   r  r    s    { (
&4 :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*7
u||,7
 !.7
 $E$5$56	7

 $D>7
 'tn7
 d^7
 
u**	+7
 7
r#   r  r   zm
    WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddee   f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd       Z xZS )WavLMForCTCtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a/  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`WavLMForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.rf  )r   r   r  rO  r;   r\   final_dropoutr^   r
  
vocab_sizerm   r!   rA   rf  r  r=   rZ   lm_headr  )r   rL   r
  r  r!   s       r"   r   zWavLMForCTC.__init__^  s     	 '
zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r#   c                     | j                   }|&t        | j                  dd      t        d| d      |-t        | j                  dd      t        j                  d       y|| j                  |d       yy)a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r
  r  rL   rm   loggerinfoload_adapter)r   r
  s     r"   tie_weightszWavLMForCTC.tie_weights{  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r#   c                 X    t        j                  dt               | j                          yr  r  Nr  r  s    r"   r  z$WavLMForCTC.freeze_feature_extractor  r  r#   c                 L    | j                   j                  j                          yr  rO  r  r  r  s    r"   r  z"WavLMForCTC.freeze_feature_encoder      
 	

$$779r#   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrO  r  r  r  s     r"   freeze_base_modelzWavLMForCTC.freeze_base_model  (    
 ZZ**, 	(E"'E	(r#   rP  rz   r|   r  r  labelsr}   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }| j                  |      }| j                  |      }	d}
|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |	dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }
ddd       |s|	f|t6        d z   }|
|
f|z   S |S t9        |
|	|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r   rS   )r9   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr'   r  )rL   r  r  r  rm   rO  r^   r  rt   	ones_liker   ru  r   r   masked_selectr;   r?  log_softmaxfloat32rO   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r'   r  )r   rP  rz   r|   r  r  r   r   r'   r(  r'  re  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r"   r(   zWavLMForCTC.forward  s'   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]**)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIra   r  )r*   r+   r,   r   r   r   r  r  r  r  r   rt   r   r   r   r   r   r(   r-   r.   s   @r"   r	  r	  X  s    HSM :<*
&:(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r#   r	  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee	j                     deeef   fd       Z xZS )WavLMForSequenceClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nrf  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rA   rf  rm   r  rO  r  use_weighted_layer_sumr;   rs   rt   ru   layer_weightsrZ   r=   classifier_proj_size	projector
num_labels
classifierr  r   rL   
num_layersr!   s      r"   r   z'WavLMForSequenceClassification.__init__  s     6=)f.@.@n   '
--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r#   c                 X    t        j                  dt               | j                          yr  r  r  s    r"   r  z7WavLMForSequenceClassification.freeze_feature_extractor  r  r#   c                 L    | j                   j                  j                          yr  r  r  s    r"   r  z5WavLMForSequenceClassification.freeze_feature_encoder  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z0WavLMForSequenceClassification.freeze_base_model  r  r#   rP  rz   r|   r  r  r   r}   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }||j                  d      }
n| j                  |j                   d   |      }|j#                  d      j%                  dd|j                   d         }d	|| <   |j                  d      |j                  d      j                  dd      z  }
| j'                  |
      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )	  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rS   r   r   r   r&  )rL   r  r=  rO  r4  rt   stackr;   r?  rB  r>  r   r   r@  r5  r~  r   r   r   rB  r   rA  r   r'   r  )r   rP  rz   r|   r  r  r   r   r'   norm_weightspooled_outputpadding_maskexpand_padding_maskr(  r'  loss_fctr9  s                    r"   r(   z&WavLMForSequenceClassification.forward&  s   . &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r#   r  )r*   r+   r,   r   r  r  r  r   r   rt   r   r   r   r   r   r(   r-   r.   s   @r"   r;  r;    s    "
&:(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
r#   r;  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee   d	ee   d
ee   deeef   fd       Z xZS ) WavLMForAudioFrameClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        |j                   | _        | j%                          y )Nrf  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rA   rf  rm   r  rO  r  r=  r;   rs   rt   ru   r>  rZ   r=   rA  rB  init_weightsrC  s      r"   r   z)WavLMForAudioFrameClassification.__init__n  s     6=)f.@.@q   '
--1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r#   c                 X    t        j                  dt               | j                          yr  r  r  s    r"   r  z9WavLMForAudioFrameClassification.freeze_feature_extractor~  r  r#   c                 L    | j                   j                  j                          yr  r  r  s    r"   r  z7WavLMForAudioFrameClassification.freeze_feature_encoder  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z2WavLMForAudioFrameClassification.freeze_base_model  r  r#   rP  rz   r   r|   r  r  r}   c           	         ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }
d}|\t               } ||
j                  d| j                        t        j                   |j                  d| j                        d            }|s|
f|t        d z   }|S t#        ||
|j$                  |j&                  	      S )
rI  NTr  r   r   rS   r   )axisr&  )rL   r  r=  rO  r4  rt   rJ  r;   r?  rB  r>  r   r   rB  r   rA  rC  r   r'   r  )r   rP  rz   r   r|   r  r  r   r'   rK  r(  r'  rO  r9  s                 r"   r(   z(WavLMForAudioFrameClassification.forward  sh   . &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r#   r  )r*   r+   r,   r   r  r  r  r   r   rt   r   r   r   r   r   r(   r-   r.   s   @r"   rQ  rQ  l  s     
&:(  26)-,0/3&*9
u||,9
 !.9
 &	9

 $D>9
 'tn9
 d^9
 
u++	,9
 9
r#   rQ  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLossc                     t         |           || _        || _        || _        t        j                  t        j                  ||      d      | _	        t        j                         | _        y )NT)r  )r   r   scalemarginrA  r;   rs   rt   randnr7   r   r'  )r   	input_dimrA  r\  r]  r!   s        r"   r   zAMSoftmaxLoss.__init__  sQ    
$ll5;;y*#EUYZ'')	r#   c                    |j                         }t        j                  j                  | j                  d      }t        j                  j                  |d      }t        j                  ||      }|| j                  z
  }t        j                  j                  || j                        }| j                  t        j                  |j                         ||      z  }| j                  ||      }|S )Nr   r   r   )flattenr;   r?  	normalizer7   rt   mmr]  one_hotrA  r\  r   r   r'  )	r   r'   r   r7   	cos_thetapsionehotr(  r'  s	            r"   r(   zAMSoftmaxLoss.forward  s    !((!(<//1/EHH]F3	$++%&&vt?ekk&++-iHHyy(r#   )g      >@g?r)   r.   s   @r"   rZ  rZ    s    *r#   rZ  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )	TDNNLayerc                    t         |           |dkD  r|j                  |dz
     n|j                  |   | _        |j                  |   | _        |j
                  |   | _        |j                  |   | _        t        j                  | j                  | j                  z  | j                        | _        t        j                         | _        y )Nr   r   )r   r   tdnn_dimr  r  tdnn_kernelr2   tdnn_dilationdilationr;   rZ   kernelReLUrK   r  s      r"   r   zTDNNLayer.__init__  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r#   r'   r}   c                 &   t               rddlm} t               r+t        | j                        rt        j                  d       |j                  dd      }| j                  j                  j                  | j                  | j                  | j                        j                  dd      }t        j                  j                  ||| j                  j                   | j"                        }|j                  dd      }| j%                  |      }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )rn  )r   peft.tuners.lorarr  r   ro  r  r  rO   r7   r   r  r2   r  r;   r?  conv1dr   rn  rK   )r   r'   rr  r7   s       r"   r(   zTDNNLayer.forward  s    2$++y1O &//15##(():):D<L<LdN^N^_iijkmno,,]FDKKDTDT_c_l_l,m%//156r#   r  )r*   r+   r,   r   rt   r   r(   r-   r.   s   @r"   ri  ri    s#    $U\\ ell r#   ri  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                        e Zd Z fdZd Zd Zd Zdeej                  e
f   fdZe	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd       Z xZS )WavLMForXVectorc                    t         |   |       t        |      | _        |j                  dz   }|j
                  r0t        j                  t        j                  |      |z        | _
        t        j                  |j                  |j                  d         | _        t        t!        |j                              D cg c]  }t#        ||       }}t        j$                  |      | _        t        j                  |j                  d   dz  |j(                        | _        t        j                  |j(                  |j(                        | _        t/        |j(                  |j0                        | _        | j5                          y c c}w )Nr   r   rS   r   )r   r   r  rO  r  r=  r;   rs   rt   ru   r>  rZ   r=   rk  r@  r   r  ri  r   tdnnxvector_output_dimr  rB  rZ  rA  	objectiverS  )r   rL   rD  r  tdnn_layersr!   s        r"   r   zWavLMForXVector.__init__  s    '
--1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQy+QQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   >Fc                 X    t        j                  dt               | j                          yr  r  r  s    r"   r  z(WavLMForXVector.freeze_feature_extractor&  r  r#   c                 L    | j                   j                  j                          yr  r  r  s    r"   r  z&WavLMForXVector.freeze_feature_encoder2  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z!WavLMForXVector.freeze_base_model9  r  r#   re  c                 V    d }| j                   j                  D ]  } |||d      } |S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr   r  rl  s      r"   ro  zBWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_lengthF  s     !;.69A==r#   r   )rL   rl  )r   re  ro  r2   s       r"   _get_tdnn_output_lengthsz(WavLMForXVector._get_tdnn_output_lengthsA  s:    
	>
  ;;22 	LK,]KKM	L r#   rP  rz   r|   r  r  r   r}   c                    ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }| j                  D ]
  }
 |
|      } |%|j                  d      }|j!                  d      }n| j#                  |j                  d            }| j%                  |      }g }g }t'        |      D ]U  \  }}|j)                  ||d|f   j                  d             |j)                  ||d|f   j!                  d             W t        j                  |      }t        j                  |      }t        j*                  ||gd      }| j-                  |      }| j/                  |      }d}|| j1                  ||      }|s||f|t        d z   }||f|z   S |S t3        ||||j4                  |j6                        S )	rI  NTr  r   r   rS   r   )r'  r(  
embeddingsr'   r  )rL   r  r=  rO  r4  rt   rJ  r;   r?  rB  r>  r   r   r@  rx  r5  rR  ru  r  r  r  r   r  rB  rz  r   r'   r  )r   rP  rz   r|   r  r  r   r   r'   rK  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr  lengthstatistic_poolingoutput_embeddingsr(  r'  r9  s                         r"   r(   zWavLMForXVector.forwardP  s   . &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5)) 	6J&}5M	6 !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':; J	6$$]1gvg:%>%C%C%C%JK##M!WfW*$=$A$Aa$A$HIJ "KK6M ;;|4L!II}l&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r#   r  )r*   r+   r,   r   r  r  r  r   rt   r   r   r  r   r   r   r   r   r   r(   r-   r.   s   @r"   rv  rv    s    &
&:(eE<L<Lc<Q6R   26,0/3&*)-O
u||,O
 !.O
 $D>	O

 'tnO
 d^O
 &O
 
um#	$O
 O
r#   rv  )rQ  r	  r;  rv  r  rN  r%   )Jr   r  typingr   r   numpyr  rt   torch.nnr;   torch.nn.functionalr?  r   r   activationsr   integrations.deepspeedr   integrations.fsdpr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r@   r   r   r   configuration_wavlmr   
get_loggerr*   r  Moduler   r0   rQ   rd   r   r   r   r   r%  r*  rN  r  r  r  r  r  r  r   r   r   r   ndarrayr  r  r  r4  r	  r;  rQ  rZ  ri  rv  __all__r  r#   r"   <module>r     s     "      % ! @ 7 9  . ? ? , 
		H	%		 *299 *Z1RYY 1c RYY c Lryy 0&2 &R"'A "JG
299 G
TH
")) H
VC' C'L S? S Sl : *8 68 0#")) #L		 $299 F 26tc?tt t U--.	t
 t ZZtn /  N
% N
 N
b !"  
S
& S

S
l p
%9 p
p
f f
'; f
 f
RBII .		 @ 
N
* N

N
br#   