
    rh_                        d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&  e#       rddl'm(Z(  e$jR                  e*      Z+ G d de	jX                        Z- G d de	jX                        Z. G d de      Z/ G d de      Z0 G d de      Z1 G d de	jX                        Z2 G d de	jX                        Z3	 	 	 dId e	jX                  d!ejh                  d"ejh                  d#ejh                  d$eejh                     d%ee5   d&e5d'eejh                     fd(Z6 G d) d*e	jX                        Z7 G d+ d,e	jX                        Z8 G d- d.e      Z9 G d/ d0e	jX                        Z: G d1 d2e	jX                        Z; G d3 d4e      Z< G d5 d6e	jX                        Z=e" G d7 d8e             Z>	 	 dJd9e?e@e@f   d:e5d;e@d$eej                     d<e@d=ej                  fd>ZCe" G d? d@e>             ZDdZE e"dAB       G dC dDe>             ZF e"dEB       G dF dGe>             ZGg dHZHy)K    N)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )HubertConfig)make_flex_block_causal_maskc                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        d | _        |j                  r&t        j                  |j                        | _        nt        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j"                  j%                  | j                  j&                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j&                  j(                  }| j                  j                  j&                  j*                  }n,| j                  j,                  }| j                  j.                  }|j"                  j1                  | |       |j"                  j1                  | |       n || j                  dd      | _        t3        |j
                        | _        t6        |j8                     | _        y # 1 sw Y   'xY w)	N   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr"   hasattrr(   r	   	deepspeedzeroGatheredParametersr%   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr    r   feat_extract_activation
activation)selfconfigr"   r6   r;   r<   	__class__s         }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/hubert/modeling_hubert.pyr*   z&HubertPositionalConvEmbedding.__init__3   s   II6622a777
	 %% nnV-?-?@DO((..Krxx00-@ hh77CC)+ ^^66tyy7G7GWX6Y M +DIIH! LDIM499&89#yy99@@JJH#yy99@@JJH#yy11H#yy11H::4J::4J'		aH	)&*H*HI !?!?@M Ms   ?I??J	c                     |j                  dd      }| j                  | j                  |      }| j                  |      }| j                  |      }| j	                  |      }|j                  dd      }|S )Nr   r   )	transposer1   r0   r    r@   rA   hidden_statess     rD   forwardz%HubertPositionalConvEmbedding.forwardX   sn    %//15??& OOM:M		-0]36%//15    __name__
__module____qualname__r*   rI   __classcell__rC   s   @rD   r   r   2   s    #AJ	rJ   r   c                   $     e Zd Z fdZd Z xZS )r>   c                 P    t         |           |dz  dk(  rd| _        y d| _        y )Nr   r   r   )r)   r*   num_pad_remove)rA   r.   rC   s     rD   r*   zHubertSamePadLayer.__init__e   s)    #:Q#>!#CarJ   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )rS   rG   s     rD   rI   zHubertSamePadLayer.forwardi   s6    ")!Q0F43F3F2F0F*FGMrJ   rK   rP   s   @rD   r>   r>   d   s    KrJ   r>   c                   &     e Zd Zd fd	Zd Z xZS )HubertNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   r   stridebias)r)   r*   conv_dimin_conv_dimout_conv_dimr+   r,   conv_kernelconv_stride	conv_biasr0   r   r?   r@   rA   rB   layer_idrC   s      rD   r*   z#HubertNoLayerNormConvLayer.__init__p   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@rJ   c                 J    | j                  |      }| j                  |      }|S N)r0   r@   rG   s     rD   rI   z"HubertNoLayerNormConvLayer.forward~   s$    		-06rJ   r   rK   rP   s   @rD   rW   rW   o   s    ArJ   rW   c                   &     e Zd Zd fd	Zd Z xZS )HubertLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rY   T)elementwise_affine)r)   r*   r\   r]   r^   r+   r,   r_   r`   ra   r0   	LayerNorm
layer_normr   r?   r@   rb   s      rD   r*   z!HubertLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@rJ   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )N)r0   rF   rl   r@   rG   s     rD   rI   z HubertLayerNormConvLayer.forward   sV    		-0%//B76%//B76rJ   rf   rK   rP   s   @rD   rh   rh      s    ArJ   rh   c                   &     e Zd Zd fd	Zd Z xZS )HubertGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rY   T)
num_groupsnum_channelsaffine)r)   r*   r\   r]   r^   r+   r,   r_   r`   ra   r0   r   r?   r@   	GroupNormrl   rb   s      rD   r*   z!HubertGroupNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqrJ   c                 l    | j                  |      }| j                  |      }| j                  |      }|S re   )r0   rl   r@   rG   s     rD   rI   z HubertGroupNormConvLayer.forward   s2    		-066rJ   rf   rK   rP   s   @rD   rq   rq      s    r rJ   rq   c                   .     e Zd ZdZ fdZd Zd Z xZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )rc   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r)   r*   feat_extract_normrq   rangenum_feat_extract_layersrW   rh   
ValueErrorr+   
ModuleListconv_layersgradient_checkpointing_requires_grad)rA   rB   ir   rC   s       rD   r*   zHubertFeatureEncoder.__init__   s    ##w.3FQGHLQRXRpRpstRtLuLGH*6AEBL K %%0QVW]WuWuQvwA3FQGwKw01I1I0JJst  ==5&+#"L xs   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y NF)
parametersrequires_gradr   rA   params     rD   _freeze_parametersz'HubertFeatureEncoder._freeze_parameters   s(    __& 	(E"'E	(#rJ   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S )NT)r   trainingr   r   )rA   input_valuesrH   
conv_layers       rD   rI   zHubertFeatureEncoder.forward   sP    $QW- 4==*.M'** 	6J&}5M	6 rJ   )rL   rM   rN   __doc__r*   r   rI   rO   rP   s   @rD   ry   ry      s    8#"$

rJ   ry   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 n   t         |           |j                  | _        | j                  r3t        j                  |j
                  d   |j                        | _        t        j                  |j
                  d   |j                        | _
        t        j                  |j                        | _        y )Nro   eps)r)   r*   feat_proj_layer_normr+   rk   r\   layer_norm_epsrl   Linearr-   
projectionDropoutfeat_proj_dropoutdropoutrA   rB   rC   s     rD   r*   z HubertFeatureProjection.__init__   s}    $*$?$?!$$ ll6??2+>FDYDYZDO))FOOB$79K9KLzz&":":;rJ   c                     | j                   r| j                  |      }| j                  |      }| j                  |      }|S re   )r   rl   r   r   rG   s     rD   rI   zHubertFeatureProjection.forward   s;    $$ OOM:M6]3rJ   rK   rP   s   @rD   r   r      s    <rJ   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }	||	|z   }	t        j
                  j                  |	d      }	||	|j                  dddd      z  }	t        j
                  j                  |	|| j                        }	t        j                  |	|      }
|
j                  dd      j                         }
|
|	fS )Nro         r   r   r'   r   )pr   )sizetorchmatmulrF   r+   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              rD   eager_attention_forwardr      s     **R.D(<<s}}Q':;gEL!#n4==((2(>L#innQAq&AA==((6??([L,,|U3K''1-88:K$$rJ   c                   H    e Zd ZdZ	 	 	 	 	 ddededededededee   f fd	Z		 	 	 	 dd
e
j                  dee
j                     dee
j                     dee
j                     dee   dee   dee
j                  ee
j                     eee
j                        f   fdZ xZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderr[   	is_causalrB   c                 
   t         |           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r[   )r)   r*   r   r   r   head_dimrB   r   r   r   r   r+   r   k_projv_projq_projout_proj)	rA   r   r   r   r   r[   r   rB   rC   s	           rD   r*   zHubertAttention.__init__  s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBrJ   rH   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                    |du}|j                   dd \  }}	|r|j                   d   n|	}
||	d| j                  f}||
d| j                  f} | j                  |      j                  | j	                  dd      }|r|n|} | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }t        }| j                  j                  dk7  rt        | j                  j                     } || ||||f| j                  sdn| j                  | j                  ||d|\  }}|j                  ||	d      j                         }| j!                  |      }||dfS )z#Input shape: Batch x Time x ChannelNro   r   r   eager        )r   r   r   r   )shaper   r   r   rF   r   r   r   rB   _attn_implementationr   r   r   r   reshaper   r   )rA   rH   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       rD   rI   zHubertAttention.forward/  s    .T9 %**3B/W/A"((+wgr4==9wDMM: 7t{{=166FPPQRTUV-?)]5T[[055~FPPQRTUV
7t{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#w;FFHmmK0L$..rJ   )r   FTFN)NNNF)rL   rM   rN   r   intfloatboolr   r   r*   r   Tensorr   r   tuplerI   rO   rP   s   @rD   r   r     s   G  )-CC C 	C
 C C C &CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/rJ   r   c                   $     e Zd Z fdZd Z xZS )HubertFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y re   )r)   r*   r+   r   activation_dropoutintermediate_dropoutr   r-   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     rD   r*   zHubertFeedForward.__init__f  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?rJ   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S re   )r   r   r   r   r   rG   s     rD   rI   zHubertFeedForward.forwards  sX    //>00?11-@))-8++M:rJ   rK   rP   s   @rD   r   r   e  s    @rJ   r   c                   &     e Zd Z fdZddZ xZS )HubertEncoderLayerc                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        y )NFr   r   r   r   rB   r   )r)   r*   r   r-   num_attention_headsattention_dropout	attentionr+   r   r   r   rk   r   rl   r   feed_forwardfinal_layer_normr   s     rD   r*   zHubertEncoderLayer.__init__~  s    (((00,,
 zz&"7"78,,v'9'9v?T?TU-f5 "V-?-?VEZEZ [rJ   c                     |}| j                  |||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S Nr   r   )r   r   rl   r   r   rA   rH   r   r   attn_residualr   _outputss           rD   rI   zHubertEncoderLayer.forward  s    %)-.L] *8 *
&|Q ]3%56%(9(9-(HH--m< "&GrJ   r   rK   rP   s   @rD   r   r   }  s    \rJ   r   c                        e Zd Z fdZ	 	 	 	 ddej
                  deej                     dededef
dZ	de
ej                  df   d	ej                  fd
Z xZS )HubertEncoderc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        y c c}w Nr   F)r)   r*   rB   r   pos_conv_embedr+   rk   r-   r   rl   r   r   r   r   r~   num_hidden_layersr   layersr   rA   rB   r   rC   s      rD   r*   zHubertEncoder.__init__  s    ;FC,,v'9'9v?T?TUzz&"7"78mmvOgOgIh$iA%7%?$ij&+# %j   !CNrH   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  ||      }| j	                  |      }	||	z   }| j                  |      }| j                  |      }t               xs t        |       }
| j                  D ]j  }|r||fz   }t        j                  g       }| j                  xr || j                  j                  k  }|r|
r ||||      }|d   }|rd}|sb|d   fz   }l |r||fz   }|st        d |||fD              S t!        |||	      S )
N ro   r   r   r   r   NNc              3   &   K   | ]	  }||  y wre   r  .0vs     rD   	<genexpr>z(HubertEncoder.forward.<locals>.<genexpr>       mq_`_lm   last_hidden_staterH   
attentions)	unsqueezerepeatr   _update_full_maskr   rl   r   r	   r
   r  r   randr   rB   	layerdropr   r   rA   rH   r   r   r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr|   dropout_probabilityskip_the_layerlayer_outputss                  rD   rI   zHubertEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001//

 #11-@%(;;6]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ %!.Te! !.a 0 , &9]1=M<O&O#'	P*   1]4D Dm]4EGZ$[mmm++*
 	
rJ   inputs_embedsc                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S Nflash_attention_2r   sdpaflex_attentionF)r   	rB   r   r   dtyper   r   r   r   r   rA   r   r!  s      rD   r  zHubertEncoder._update_full_mask      
 %{{//3FF343F  MQ  11V; "E^UbUhUh!i  115EEnell;%@[`%aN
  "<NML_L_!`rJ   NFFT)rL   rM   rN   r*   r   tensorr   r   r   rI   r   r  rO   rP   s   @rD   r   r     s    , 26"'%* :
||:
 !.:
  	:

 #:
 :
xellD01 ||rJ   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )HubertAttnAdapterLayerc                    t         |           |j                  | _        |j                  | _        t        j                  | j
                        | _        t        j                  | j
                  | j                        | _
        t        j                         | _        t        j                  | j                  | j
                        | _        y)z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r)   r*   adapter_attn_dim	input_dimr-   
hidden_dimr+   rk   normr   linear_1ReLUact_fnlinear_2r   s     rD   r*   zHubertAttnAdapterLayer.__init__  s    
 	00 ,,LL1			$//4>>Bggi		$..$//BrJ   rH   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S re   )r3  r4  r6  r7  rG   s     rD   rI   zHubertAttnAdapterLayer.forward  s@    		-0m4M2m4rJ   )rL   rM   rN   r*   r   FloatTensorrI   rO   rP   s   @rD   r.  r.    s    CU%6%6 rJ   r.  c                   f     e Zd Z fdZ	 	 ddej
                  deej
                     defdZ xZ	S )!HubertEncoderLayerStableLayerNormc                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        t#        |dd       t%        |      | _        y d | _        y )NFr   r   r0  )r)   r*   r   r-   r   r   r   r+   r   r   r   rk   r   rl   r   r   r   getattrr.  adapter_layerr   s     rD   r*   z*HubertEncoderLayerStableLayerNorm.__init__  s    (((00,,
 zz&"7"78,,v'9'9v?T?TU-f5 "V-?-?VEZEZ [6-t4@!7!?D!%DrJ   rH   r   r   c                 $   |}| j                  |      }| j                  |||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }| j
                  || j                  |      z   }|f}|r||fz  }|S r   )rl   r   r   r   r   r>  r   s           rD   rI   z)HubertEncoderLayerStableLayerNorm.forward+  s     &6)-.L] *8 *
&|Q ]3%5%(9(9$:O:OP]:^(__))D,>,>},MMM "&GrJ   r   )
rL   rM   rN   r*   r   r   r   r   rI   rO   rP   s   @rD   r;  r;    s>    &, 26"'	|| !.  	rJ   r;  c                   p     e Zd Z fdZ	 	 	 	 ddZdeej                  df   dej                  fdZ xZ	S )HubertEncoderStableLayerNormc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        y c c}w r   )r)   r*   rB   r   r   r+   rk   r-   r   rl   r   r   r   r   r~   r   r;  r  r   r  s      rD   r*   z%HubertEncoderStableLayerNorm.__init__F  s    ;FC,,v'9'9v?T?TUzz&"7"78mm@EfF^F^@_`1.v6`
 ',# ar  Nc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  ||      }| j	                  |      }	||	z   }| j                  |      }t               xs t        |       }
| j                  D ]j  }|r||fz   }t        j                  g       }| j                  xr || j                  j                  k  }|r|
r ||||      }|d   }|rd}|sb|d   fz   }l | j                  |      }|r||fz   }|st        d |||fD              S t!        |||	      S )
Nr  ro   r   r   r   r   r  c              3   &   K   | ]	  }||  y wre   r  r
  s     rD   r  z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  r  r  )r  r  r   r  r   r   r	   r
   r  r   r  r   rB   r  rl   r   r   r  s                  rD   rI   z$HubertEncoderStableLayerNorm.forwardQ  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001//

 #11-@%(;;]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ !&!.Te! !.a 0 , &9]1=M<O&O#)	P, 6 1]4D Dm]4EGZ$[mmm++*
 	
rJ   r   r!  c                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S r#  r'  r)  s      rD   r  z.HubertEncoderStableLayerNorm._update_full_mask  r*  rJ   r+  )
rL   rM   rN   r*   rI   r   r   r   r  rO   rP   s   @rD   rA  rA  E  sE    	, "<
|ellD01 ||rJ   rA  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
d Zdeej                  ef   fdZded	ej                  fd
Zy)HubertPreTrainedModelrB   hubertr   Tc                 z   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                  t        j                  t        j                  f      rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t        j                        r_t               rddl}t#        |d      r|t#        |d      rp|j$                  j'                  |j(                  |j*                  gd      5  t        j,                  j/                  |j                  j                         ddd       n|j$                  j'                  |j                  d      5  t        j,                  j/                  |j                  j                         ddd       n3t        j,                  j/                  |j                  j                         |j                  %|j                  j                  j                          yyt        |t0              r2t#        |d	      r%|j2                  j                  j5                          yyt        |t6              rMt#        |d
      r@|j8                  j                  j                  d| j                  j:                  dz   z         yyy# 1 sw Y   xY w# 1 sw Y   xY w)zInitialize the weightsr   )meanstdNg      ?r   r<   r;   r#   masked_spec_embedlayer_weightsr   )r   r+   r   r%   datanormal_rB   initializer_ranger[   zero_rk   rv   r3   fill_r,   r	   r6   r5   r7   r8   r<   r;   initkaiming_normal_HubertModelrL  uniform_HubertForSequenceClassificationrM  r   )rA   r   r6   s      rD   _init_weightsz#HubertPreTrainedModel._init_weights  sP   fbii( MM&&CT[[5R5R&S{{&  &&( 'r||R^^ LMKK""$MM$$S)		*)+ 6:.76:3N"::FOOV__;]mn:o D//0B0BCD D #::6==XY:Z D//0B0BCD D ''(:(:;{{&  &&( ',v23((--668 4 ?@v/$$))//t{{7T7TWX7X0YZ 0 AD DD Ds   ?4L%#4L1%L.1L:input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)input_lengthr   rZ   s      rD   _conv_out_lengthzPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s"     99\K7wWZ[[[rJ   )ziprB   r_   r`   )rA   rY  r`  r   rZ   s        rD    _get_feat_extract_output_lengthsz6HubertPreTrainedModel._get_feat_extract_output_lengths  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q rJ   feature_vector_lengthr   c                    | j                  |j                  d            j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nro   r   )r(  devicer   )re  )rb  sumtor   longr   zerosr(  re  arangeflipcumsumr   )rA   rc  r   output_lengths
batch_sizes        rD   "_get_feature_vector_attention_maskz8HubertPreTrainedModel._get_feature_vector_attention_mask  s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOrJ   N)rL   rM   rN   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrX  r   r   
LongTensorr   rb  ro  r  rJ   rD   rG  rG    sh     $O&*#N[BeEDTDTVYDY>Z 
 
]b]m]m 
rJ   rG  r   	mask_probmask_length	min_masksr   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r_  num_masked_spanepsilonry  rx  rz  sequence_lengths     rD   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOrJ   Nro   r(  r   F)replace)r   nprandomr  itemdetachrf  tolistr~   ri  r   choicerj  lenconcatenateonesint32appendarraybroadcast_tor   r~  put_along_axis)r   rx  ry  r   rz  rn  r  r   rY  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr_  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@rD   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   *    e Zd Zdef fdZ	 	 ddej                  deej                     deej                     fdZ	e
	 	 	 	 	 ddeej                     deej                     deej                     dee   d	ee   d
ee   deeef   fd       Z xZS )rU  rB   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nr   )r)   r*   rB   ry   feature_extractorr   feature_projectionmask_time_probmask_feature_probr+   	Parameterr   r   r-   rV  rL  do_stable_layer_normrA  encoderr   	post_initr   s     rD   r*   zHubertModel.__init__f  s     !5f!="9&"A   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&7?DL(0DL 	rJ   rH   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rx  ry  r   rz  )re  r(  )rx  ry  rz  ro   )r=  rB   r   rL  rg  r(  r  r   r  mask_time_lengthmask_time_min_masksr   r,  re  r   r  mask_feature_lengthmask_feature_min_masksexpand)rA   rH   r  r   rn  r  r-   mask_feature_indicess           rD   _mask_hidden_stateszHubertModel._mask_hidden_statesx  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./rJ   r   r   r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      }| j                  ||      }| j                  |||||      }	|	d   }|s	|f|	dd z   S t        ||	j                  |	j                        S )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r  r   r   r  r  r   r  )rB   r   r  use_return_dictr  rF   ro  r   r  r  r  r   rH   r  )
rA   r   r   r  r   r  r  extract_featuresrH   encoder_outputss
             rD   rI   zHubertModel.forward  s,   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN//0@A00Rc0d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
rJ   r  NNNNN)rL   rM   rN   r   r*   r   r9  r   rw  r  r   r   r   r   r   r   rI   rO   rP   s   @rD   rU  rU  d  s    | * :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*D
u||,D
 !.D
 $E$5$56	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
rJ   rU  zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddee   f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd       Z xZS )HubertForCTCtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r)   r*   rU  rH  r+   r   final_dropoutr   r  
vocab_sizer   rC   r5   r  output_hidden_sizer-   r   lm_headr  )rA   rB   r  r  rC   s       rD   r*   zHubertForCTC.__init__  s     	 !&)zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	rJ   c                     | j                   }|&t        | j                  dd      t        d| d      |-t        | j                  dd      t        j                  d       y|| j                  |d       yy)a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr0  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r=  rB   r   loggerinfoload_adapter)rA   r  s     rD   tie_weightszHubertForCTC.tie_weights  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %rJ   c                 X    t        j                  dt               | j                          y)
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderrA   s    rD   freeze_feature_extractorz%HubertForCTC.freeze_feature_extractor)  '    
 	Q	

 	##%rJ   c                 L    | j                   j                  j                          yr  NrH  r  r   r  s    rD   r  z#HubertForCTC.freeze_feature_encoder5      
 	%%88:rJ   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrH  r   r   r   s     rD   freeze_base_modelzHubertForCTC.freeze_base_model<  (    
 [[++- 	(E"'E	(rJ   r   r   r   r  r  labelsr   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }| j                  |      }| j                  |      }	d}
|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |	dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }
ddd       |s|	f|t6        d z   }|
|
f|z   S |S t9        |
|	|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r  ro   )r'   r(  r   F)enabled)blank	reductionzero_infinitylosslogitsrH   r  )rB   r  r~  r  r   rH  r   r  r   	ones_likerh  rb  rf  rg  masked_selectr+   r   log_softmaxfloat32rF   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rH   r  )rA   r   r   r   r  r  r  r   rH   r  r  rY  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    rD   rI   zHubertForCTC.forwardD  s'   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]++)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIre   r  )rL   rM   rN   r   r   r*   r  r  r  r  r   r   r   r   r   r   r   rI   rO   rP   s   @rD   r  r    s    HSM :<*
&;(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
rJ   r  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee	j                     deeef   fd       Z xZS )rW  c                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nr  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r)   r*   r5   r  r   rU  rH  r   use_weighted_layer_sumr+   r  r   r  rM  r   r-   classifier_proj_size	projector
num_labels
classifierr  )rA   rB   
num_layersrC   s      rD   r*   z(HubertForSequenceClassification.__init__  s     6=)f.@.@o  "&)--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	rJ   c                 X    t        j                  dt               | j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  s    rD   r  z8HubertForSequenceClassification.freeze_feature_extractor  r  rJ   c                 L    | j                   j                  j                          yr  r  r  s    rD   r  z6HubertForSequenceClassification.freeze_feature_encoder  r  rJ   c                 P    | j                   j                         D ]	  }d|_         yr  r  r   s     rD   r  z1HubertForSequenceClassification.freeze_base_model  r  rJ   r   r   r   r  r  r  r   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }||j                  d      }
n| j                  |j                   d   |      }|j#                  d      j%                  dd|j                   d         }d	|| <   |j                  d      |j                  d      j                  dd      z  }
| j'                  |
      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   ro   r   r   r   r  )rB   r  r  rH  r  r   stackr+   r   r   rM  r   rf  r  rJ  ro  r   r  r  r  r   r  r   rH   r  )rA   r   r   r   r  r  r  r   rH   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    rD   rI   z'HubertForSequenceClassification.forward  s   . &1%<k$++B]B]'+{{'I'ItOc++)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
rJ   r  )rL   rM   rN   r*   r  r  r  r   r   r   r   r   r   r   r   rI   rO   rP   s   @rD   rW  rW    s    "
&;(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
rJ   rW  )r  rW  rU  rG  )Nr   NrU   )Ir  typingr   r   r   numpyr  r   torch.nnr+   r   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   r4   r   r   r   configuration_hubertr   integrations.flex_attentionr   
get_loggerrL   r  Moduler   r>   rW   rh   rq   ry   r   r   r   r   r   r   r   r   r.  r;  rA  rG  r   r   rw  ndarrayr  rU  r  r  rW  __all__r  rJ   rD   <module>r     s  ,  , ,    % ! @ 7 g B 9 Y Y F & J J .  !J 
		H	%/BII /d !; *9 69 0#299 #Lbii 0  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<U/bii U/p		 0!3 !HZBII ZzRYY 2+(B +\^299 ^B CO C CT 26tc?tt t U--.	t
 t ZZtn F
' F
 F
R !"  
S
( S

S
l p
&; p
p
f frJ   