
    rhG                       d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmc mZ ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(  e#jR                  e*      Z+d Z,d Z-d Z.dbdZ/de
j`                  de
j`                  fdZ1e e!d       G d de                     Z2e e!d       G d de                     Z3ee! G d de                     Z4 G d d ejj                        Z6 G d! d"ejj                        Z7 G d# d$ejj                        Z8 G d% d&ejj                        Z9 G d' d(ejj                        Z: G d) d*ejj                        Z; G d+ d,ejj                        Z< G d- d.ejj                        Z= G d/ d0ejj                        Z> G d1 d2e      Z? G d3 d4ejj                        Z@ G d5 d6ejj                        ZA G d7 d8ejj                        ZB G d9 d:ejj                        ZC	 	 dcd;ejj                  d<e
j`                  d=e
j`                  d>e
j`                  d?ee
j`                     d@eDdAeDdBee
j`                     fdCZE G dD dEejj                        ZF G dF dGejj                        ZG G dH dIejj                        ZH G dJ dKejj                        ZI G dL dMejj                        ZJ G dN dOe      ZK G dP dQejj                        ZL G dR dSejj                        ZMe! G dT dUe             ZN G dV dWeN      ZO e!dX       G dY dZeN             ZPe! G d[ d\eN             ZQe! G d] d^eN             ZRe! G d_ d`eN             ZSg daZTy)dzPyTorch CLAP model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringcan_return_tuplelogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j                   \  }}}| dddddddf   j                  dd|d      }|j                  |||z  |      }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.pyinterpolater)   +   sX     .;-@-@*ZkaD!m,33Aq%CI!!*kE.A;OI    c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r	            r   viewpermute
contiguous)r"   window_sizer$   heightwidthnum_channelswindowss          r(   window_partitionr9   <   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr*   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r/   r   r   r	   r,   r-   r.   r0   )r8   r4   r5   r6   r7   s        r(   window_reverser;   Q   sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr*   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxpast_key_values_lengthmaskincremental_indicess        r(   "create_position_ids_from_input_idsrJ   e   sW     <<$((*D <<!4<<TBE[[_cc##%33r*   logitsreturnc                     t        j                  t        |       | j                        }t        j
                  j                  | |      S )Ndevice)rA   arangelenrO   r   
functionalcross_entropy)rK   labelss     r(   contrastive_lossrU   w   s1    \\#f+fmm<F==&&vv66r*   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r"   
attentions)__name__
__module____qualname____doc__rY   r   rA   FloatTensor__annotations__rZ   r"   tupler[    r*   r(   rX   rX   |   sr    
 04K%++,359x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r*   rX   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsrZ   .r"   r[   )r\   r]   r^   r_   rf   r   rA   r`   ra   rZ   r"   rb   r[   rc   r*   r(   re   re      sr    
 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r*   re   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrY   rf   text_model_outputaudio_model_outputrL   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))rl   rm   N)getattrto_tuple).0kselfs     r(   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   s=      
  KKDGQXY]_`QaQjQjQll
s   -0)rb   keysrt   s   `r(   rq   zClapOutput.to_tuple   s#     
YY[
 
 	
r*   )r\   r]   r^   r_   ri   r   rA   r`   ra   rj   rk   rY   rf   rl   r   rm   rb   r   rq   rc   r*   r(   rh   rh      s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448185929
%* 
r*   rh   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    c                 0    t         |           || _        y N)super__init__	drop_prob)rt   r~   	__class__s     r(   r}   zClapDropPath.__init__   s    "r*   c                 J   | j                   dk(  s| j                  s|S d| j                   z
  }|j                  d   fd|j                  dz
  z  z   }|t	        j
                  ||j                  |j                        z   }|j                          |j                  |      |z  }|S )N        r   r   )r   dtyperO   )
r~   trainingr   ndimrA   randr   rO   floor_div)rt   r"   	keep_probr   random_tensoroutputs         r(   forwardzClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJuM<O<OXeXlXl$mm""9-=r*   r{   )r\   r]   r^   r_   r}   r   __classcell__r   s   @r(   ry   ry      s    
#r*   ry   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t         |           |j                  }|j                  }t	        ||z        }t        j                  t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _
        t        j                  t        j                  d      t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _        t        j                         | _        y )Nr   r   kernel_sizestridepaddingT)inplace)r|   r}   patch_embeds_hidden_sizeaff_block_rr@   r   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)rt   r   channelsdownsize_ratiointer_channelsr   s        r(   r}   zClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 zz|r*   c                     ||z   }| j                  |      | j                  |      z   }| j                  |      }d|z  |z  d|z  d|z
  z  z   }|S )Nr,   r   )r   r   r   )rt   r"   residualattention_inputfused_layer_outputr   s         r(   r   zClapAudioAFFBlock.forward   sb    '(2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar*   r\   r]   r^   r_   r   r}   r   r   r   s   @r(   r   r      s    
$ $0r*   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    t         |           t        |j                  t              r|j                  |j                  fn|j                  }t        |j
                  t              r|j
                  |j
                  fn|j
                  }t        |j                  t              r|j                  |j                  fn|j                  }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j                  d   | j                  d   z  | _	        |j                  | _        |j                  | _        |d   |d   z
  dz  |d   |d   z
  dz  f}| j                  r|j                  dk(  rdnd}t        j                  |j                   |z  |j"                  |||      | _        |j&                  rt        j(                  |j"                        nt        j*                         | _        | j                  rZt/        |      | _        t        j                  |j                   |j"                  |d   |d   dz  f|d   |d   dz  f|      | _        y y )Nr   r   r,   channel_mapr-   r   r	   )r|   r}   
isinstance	spec_sizer@   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)rt   r   r   r   r   r   scale_factorr   s          r(   r}   zClapAudioPatchEmbed.__init__  s+   ;EfFVFVX[;\F$$f&6&67bhbrbr6@ARARTW6XV 1 12^d^o^o 	 ;EVEXEXZ]:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab!//f6H6HM6Yq`aII--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r*   c                    | j                   r|d d ddd d d d f   }|j                  \  }}}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }|j                  d      }t        |      dkD  r||dd d d d d f   j                         }	|	j                  \  }}}}|	j                  ||z  d||      }	| j                  |	      }	|	j                  \  }
}}}|	j                  |||||      }	|	j                  d      j                         j                  d	      }	|	j                  d      }t        j                  j                  j                  |	d||z
  fd
d      }	| j!                  ||   |	      ||<   |}nx|j                  \  }
}
}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }| j                  r!|j                  d      j#                  dd      }| j%                  |      }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r/   )r   r,   r	   r   r-   r	   constantr,   )r   r   r   
ValueErrorr   sizerQ   r3   r1   r   r2   r   rA   r   rR   padr   	transposer   )rt   r"   is_longer_idxglobal_hidden_statesr$   r7   r5   r6   output_widthlocal_hidden_states_featureslocal_widths                r(   r   zClapAudioPatchEmbed.forward9  s   #0AaCA#>  7K6P6P3Jfeq))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&3M12q!4K&L&W&W&Y#:M:S:S7
L&%&9&>&>zL?XZ[]cej&k#&*oo6I&J#-@-F-F*8VU&9&>&>z<Yacikp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\K-G)H*VW'# 7;6G6G(79L7$]3 1M"/"5"5Aq&%q))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r*   r{   r   r   s   @r(   r   r   	  s    
( (T/r*   r   c                        e Zd Z fdZ	 	 	 ddej
                  deej                     deej                     dee   de	ej
                     f
dZ
 xZS )	ClapAudioSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        t        j"                  | j                  d         }t        j"                  | j                  d         }t        j$                  t'        ||gd            }t        j(                  |d      }|d d d d d f   |d d d d d f   z
  }	|	j+                  ddd      j-                         }	|	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   d| j                  d   z  dz
  z  cc<   |	j/                  d	      }
| j1                  d
|
       t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j<                  |j>                        | _         y )Nr   The hidden size (6) is not a multiple of the number of attention heads ()r,   r   ij)indexingr/   relative_position_indexbias)!r|   r}   r   num_attention_headsr@   attention_head_sizeall_head_sizer   collectionsabcIterabler4   r   	ParameterrA   zerosrelative_position_bias_tablerP   stackr   r   r2   r3   sumregister_bufferLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropout)rt   r   r>   	num_headsr4   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r   s              r(   r}   zClapAudioSelfAttention.__init__m  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
)
 << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OPYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr*   r"   attention_mask	head_maskoutput_attentionsrL   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }t        j                  |	|
j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }|||z  }t        j                  ||      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr/   r   r,   r   r=   r	   )r   r   r   r1   r   r   r   rA   matmulmathsqrtr   r   r4   r2   r3   	unsqueezer   r   rR   softmaxr   r   r   )rt   r"   r   r   r   r$   r>   r7   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r(   r   zClapAudioSelfAttention.forward  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r*   NNFr\   r]   r^   r}   rA   Tensorr   r`   boolrb   r   r   r   s   @r(   r   r   l  sq    #GP 7;15,16||6 !!2!236 E--.	6
 $D>6 
u||	6r*   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapAudioSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r{   )r|   r}   r   r   denser   r   r   rt   r   r>   r   s      r(   r}   zClapAudioSelfOutput.__init__  s6    YYsC(
zz&"E"EFr*   r"   input_tensorrL   c                 J    | j                  |      }| j                  |      }|S r{   r  r   rt   r"   r  s      r(   r   zClapAudioSelfOutput.forward  s$    

=1]3r*   r\   r]   r^   r}   rA   r  r   r   r   s   @r(   r  r    s2    G
U\\  RWR^R^ r*   r  c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
ClapAudioAttentionc                     t         |           t        ||||      | _        t	        ||      | _        t               | _        y r{   )r|   r}   r   rt   r  r   setpruned_heads)rt   r   r>   r   r4   r   s        r(   r}   zClapAudioAttention.__init__  s8    *63	;O	)&#6Er*   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y Nr   r   r=   rQ   r   rt   r   r   r  r   r   r   r   r   r  r   unionrt   headsindexs      r(   prune_headszClapAudioAttention.prune_heads     u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r*   r"   r   r   r   rL   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   rt   r   )rt   r"   r   r   r   self_outputsattention_outputr
  s           r(   r   zClapAudioAttention.forward  sG     yy	K\];;|AF#%QR(88r*   r  r\   r]   r^   r}   r%  rA   r  r   r`   r  rb   r   r   r   s   @r(   r  r    st    ";* 7;15,1
||
 !!2!23
 E--.	

 $D>
 
u||	
r*   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r{   )r|   r}   r   r   r@   	mlp_ratior  r   
hidden_actstrr
   intermediate_act_fnr  s      r(   r}   zClapAudioIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r*   r"   rL   c                 J    | j                  |      }| j                  |      }|S r{   r  r2  rt   r"   s     r(   r   zClapAudioIntermediate.forward
  &    

=100?r*   r  r   s   @r(   r-  r-    #    9U\\ ell r*   r-  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y r{   )
r|   r}   r   r   r@   r/  r  r   hidden_dropout_probr   r  s      r(   r}   zClapAudioOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r*   r"   rL   c                 J    | j                  |      }| j                  |      }|S r{   r  r5  s     r(   r   zClapAudioOutput.forward  s$    

=1]3r*   r  r   s   @r(   r9  r9    s#    >
U\\ ell r*   r9  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 ddej                  de	e
e
f   deej                     dee   d	ee   d
e	ej                  ej                  f   fdZ xZS )ClapAudioLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)r4   r   )r|   r}   chunk_size_feed_forward
shift_sizer4   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionry   r   	drop_pathlayernorm_afterr-  intermediater9  r   )rt   r   r>   rD  r   drop_path_raterC  r   s          r(   r}   zClapAudioLayer.__init__  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a9G#9Mn5SUS^S^S`!||CV5J5JK1&#>%fc2r*   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr4   r   rC  rA   jit
is_tracingtensor)rt   rD  s     r(   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_size,  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r*   c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   r   r/   r,   g      Yr   )	rC  rA   r   slicer4   r9   r1   r   masked_fill)rt   r5   r6   r   rO   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r(   get_attn_maskzClapAudioLayer.get_attn_mask4  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir*   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS rM  )r4   r   rR   r   )rt   r"   r5   r6   	pad_right
pad_bottom
pad_valuess          r(   	maybe_padzClapAudioLayer.maybe_padP  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r*   r"   input_dimensionsr   r   always_partitionrL   c                    |s| j                  |       n	 |\  }}|j                         \  }}	}
|}| j                  |      }|j                  ||||
      }| j	                  |||      \  }}|j
                  \  }	}}}	| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |
      }| j                  |||j                  |j                        }| j                  ||||      }|d   }|j                  d| j                  | j                  |
      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |
      }|| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r,   )shiftsdimsr/   r   )r   r	   r.   r   )rR  r   rF  r1   rc  r   rC  rA   rollr9   r4   r^  r   rO   rG  r;   r3   rH  rI  rJ  r   )rt   r"   rd  r   r   re  r5   r6   r$   r   r   shortcutrb  
height_pad	width_padshifted_hidden_stateshidden_states_windowsr]  attention_outputsr*  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r(   r   zClapAudioLayer.forwardW  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN!9iK\ + 
 -Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr*   )r   r   NFF)r\   r]   r^   r}   rR  r^  rc  rA   r  rb   r@   r   r`   r  r   r   r   s   @r(   r>  r>    s    38) 26,1+0A||A  S/A E--.	A
 $D>A #4.A 
u||U\\)	*Ar*   r>  c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   deej                     dee
   dee
   deej
                     fdZ xZS )
ClapAudioStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr,   r   )r   r>   rD  r   rK  rC  )r>   
norm_layerF)r|   r}   r   r>   r   
ModuleListranger>  r4   blocksr   
downsamplepointing)
rt   r   r>   rD  depthr   rH  r}  ir   s
            r(   r}   zClapAudioStage.__init__  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r"   rd  r   r   re  rL   c                    |\  }}t        | j                        D ]  \  }}	|||   nd }
 |	|||
||      }|d   }! |}| j                  )|dz   dz  |dz   dz  }}||||f}| j                  ||      }n||||f}|||f}|r|dd  z  }|S )Nr   r   r,   )	enumerater|  r}  )rt   r"   rd  r   r   re  r5   r6   r  layer_modulelayer_head_maskrt  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                    r(   r   zClapAudioStage.forward  s     )(5 	-OA|.7.CilO(/BSUeM *!,M	- -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr*   ru  )r\   r]   r^   r}   rA   r  rb   r@   r   r`   r  r   r   r   s   @r(   rw  rw    sz    < 26,1+0||  S/ E--.	
 $D> #4. 
u||	r*   rw  c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    rD  r>   ry  rL   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr-   r,   Fr   )r|   r}   rD  r>   r   r   	reductionr   )rt   rD  r>   ry  r   s       r(   r}   zClapAudioPatchMerging.__init__  sI     01s7AG%@q3w'	r*   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr,   r   r   )r   rR   r   )rt   input_featurer5   r6   
should_padrb  s         r(   rc  zClapAudioPatchMerging.maybe_pad  sU    qjAo:519>
Q519a!<JMM--mZHMr*   r  rd  c                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r,   r   r/   r-   )r   r1   rc  rA   catr   r  )rt   r  rd  r5   r6   r$   r>   r7   input_feature_0input_feature_1input_feature_2input_feature_3s               r(   r   zClapAudioPatchMerging.forward  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r*   )r\   r]   r^   r_   r   r   rb   r@   Moduler}   rc  rA   r  r   r   r   s   @r(   r  r    sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r*   r  c                        e Zd Z fdZd Z	 	 	 	 	 	 	 ddeej                     deej                     dee   dee   dee   dee   d	ee   d
e	e
ef   fdZ xZS )ClapAudioEncoderc                    t         |           t        |j                        | _        || _        t        |      | _        |j                  | _        | j                  j                  | _	        |j                  | _
        |j                  |j                  z  | _        t        |j                  d| j                  dz
  z  z        | _        t!        j"                  d|j$                  t'        |j                        d      D cg c]  }|j)                          }}| j                  j*                  }t-        | j                        D cg c]  }|d   d|z  z  |d   d|z  z  f c}| _        t1        j2                  t-        | j                        D cg c]  }t5        |t        |j                  d|z  z        | j.                  |   |j                  |   |j6                  |   |t'        |j                  d |       t'        |j                  d |dz           || j                  dz
  k  rt8        nd        c}      | _        d| _        t1        j>                  |j                        | _         t1        jB                  | j                        | _"        |j                  | _        t1        jF                  d      | _$        y c c}w c c}w c c}w )Nr,   r   r   cpurN   )r   r>   rD  r  r   rH  r}  F)%r|   r}   rQ   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior@   r   num_featuresrA   linspacerK  r   itemr   r{  input_resolutionsr   rz  rw  r   r  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)rt   r   xrK  r   r  i_layerr   s          r(   r}   zClapAudioEncoder.__init__  sW   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vwq!&&(ww$$..	\abfbqbq\r!sWX9Q<AqD#99Q<AqD;Q"R!smm  %T__5  !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@4??UVCV9V4]a
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   J<KB#Kc                    |j                   \  }}}}t        | j                  | j                  z        }| j                  | j                  z  }||kD  s||kD  rt	        d      ||k  r%t
        j                  j                  |||fdd      }||k  r%t
        j                  j                  |||fdd      }|j                   \  }}}	}
|j                  ||| j                  z  |	| j                  z  |
      }|j                  dddd      j                         }|j                  |||
| j                  z  |	| j                  z        }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r	   r,   )r   r@   r   r  r   r   rR   r)   r!   r2   r3   )rt   normalized_input_featuresr   r%   freq_length
spec_widthspec_heightbatchr   timefreqs              r(   reshape_mel2imgz ClapAudioEncoder.reshape_mel2img8  s`   
 *C)H)H&1k;$//9:
nn7#{['@_`` #(*(A(A)J+D9dh )B )% $(*(A(A)K+EIei )B )% '@&E&E#xt %>$E$E8doo-tt/F%
! %>$E$EaAq$Q$\$\$^!$=$E$E8TDOO3TT__5L%
! )(r*   	is_longerr   r   output_hidden_states(output_hidden_states_before_downsamplingre  return_dictrL   c	                 &   |j                  dd      }| j                  |      }	|	j                  dd      }	d }
| j                  r6|j                  |j                        }t        j                  |dk(        d   }
| j                  |	      }|j                  d   }| j                  ||
      }|rdnd }|rdnd }|rdnd }| j                  d   }|rE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }}|||   nd }| j                  |   } ||||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                  \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }|s||dd  z  } | j                  |      }|j                  \  }}}|dt!        | j"                        dz
  z  z  | j$                  d   z  }|dt!        | j"                        dz
  z  z  | j$                  d   z  }|j                  ddd      j'                         j)                  ||||      }|j                  \  }}} }!| | j*                  z  }"|j)                  ||| |"z  |"|!      }|j                  ddddd      j'                         j)                  |||"d      }| j-                  t        j.                  |d            }#t        j.                  |#d      }#|st1        d	 ||#||fD              S t3        ||#||
      S )Nr   r	   r   r,   rc   r   r/   r-   c              3   $   K   | ]  }|| 
 y wr{   rc   )rr   vs     r(   ru   z+ClapAudioEncoder.forward.<locals>.<genexpr>  s      	 = 	s   rZ   pooler_outputr"   r[   )r   r  r   torO   rA   wherer  r   r  r  r1   r2   r  r  r   rQ   r  r   r3   r!   r  r  r   rb   r   )$rt   input_featuresr  r   r   r  r  re  r  r  is_longer_list_idxis_longer_listr"   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrd  r$   r   hidden_sizereshaped_hidden_stater  r  r  rt  r  r  rZ   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs$                                       r(   r   zClapAudioEncoder.forward\  s    (11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((8JK"6BD+?RT"$5b411!4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 #	9OA|.7.CilO#55a8(/BSUeM *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#G#	9J !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7 	 &!.'		 	 	 */'4*	
 	
r*   )NNFFFFT)r\   r]   r^   r}   r  r   rA   r`   r  r   rb   re   r   r   r   s   @r(   r  r    s    &/P")N 2615,1/4CH+0&*u
 E--.u
 E--.	u

 $D>u
 'tnu
 3;4.u
 #4.u
 d^u
 
u**	+u
r*   r  c                   4     e Zd Zdeeef   f fdZd Z xZS )ClapProjectionLayerr   c                     t         |           || _        |j                  }|j                  }t        j                  ||      | _        t        |j                     | _
        t        j                  ||      | _        y r{   )r|   r}   r   r  projection_dimr   r   linear1r
   projection_hidden_act
activationlinear2)rt   r   r  r  r   s       r(   r}   zClapProjectionLayer.__init__  sa    ((..yyn= !=!=>yy@r*   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r{   )r  r  r  r5  s     r(   r   zClapProjectionLayer.forward  s2    ]36]3r*   )	r\   r]   r^   r   r   r   r}   r   r   r   s   @r(   r  r    s     Au_n%DE Ar*   r  c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )ClapTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)rF   r@  position_embedding_typeabsoluteposition_ids)r   r/   T)
persistenttoken_type_ids)r   )r|   r}   r   	Embedding
vocab_sizer  pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr   rE  r   r;  r   rp   r  r   rA   rP   expandr   r  r   rD   rF   rt   r   r   s     r(   r}   zClapTextEmbeddings.__init__  si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	

 "..#%<<**F,>,>DL\L\$
 r*   c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr/   r   r  r   r   r  )rJ   rF   &create_position_ids_from_inputs_embedsr   hasattrr  r  rA   r   rD   r  rO   r  r  r  r  r   r   )rt   rE   r  r  inputs_embedsrG   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  
embeddingsr  s                r(   r   zClapTextEmbeddings.forward  sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r*   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr/   r   r   r   )r   rA   rP   rF   rD   rO   r   r  )rt   r  r  sequence_lengthr  s        r(   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds.  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r*   )NNNNr   )r\   r]   r^   r_   r}   r   r  r   r   s   @r(   r  r    s    

4 rs&P=r*   r  moduler   r   r   r   scalingr   r   c                 .   t        j                  ||j                  dd            |z  }	|#|d d d d d d d |j                  d   f   }
|	|
z   }	t        j
                  j                  |	dt         j                        j                  |j                        }	t        j
                  j                  |	|| j                        }	||	|j                  dddd      z  }	t        j                  |	|      }|j                  dd      j                         }||	fS )Nr,   r	   r   r/   )r>   r   )pr   r   )rA   r   r   r   r   rR   r   float32r  r   r   r   r1   r3   )r  r   r   r   r   r  r   r   kwargsattn_weightscausal_maskattn_outputs               r(   eager_attention_forwardr  A  s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#innQAq&AA,,|U3K''1-88:K$$r*   c                        e Zd Z fdZ	 	 	 ddej
                  deej                     deej                     dee   de	ej
                     f
dZ
 xZS )	ClapTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizer   r   r         )r|   r}   r  r   r  r   r   r@   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  s     r(   r}   zClapTextSelfAttention.__init__^  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r*   r"   r   r   r   rL   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|f| j                  sdn| j                  | j                  |d|\  }} |j                  g |d j                         }|r||f}|S |f}|S )Nr/   r   r,   eagerr   )r   r  r   )r   r   r   r1   r   r   r   r  r   _attn_implementationr   r   r  r  r!   r3   )rt   r"   r   r   r   r  r  r   query_states
key_statesvalue_statesattention_interfacer  r  r
  s                  r(   r   zClapTextSelfAttention.forwards  sa    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
!\ *k));;;;FFH1B;- JUr*   r  r  r   s   @r(   r  r  ]  so    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	!r*   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr@  )r|   r}   r   r   r  r  r   rE  r   r;  r   r  s     r(   r}   zClapTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r*   r"   r  rL   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r{   r  r   r   r  s      r(   r   zClapTextSelfOutput.forward  7    

=1]3}|'CDr*   r  r   s   @r(   r  r    1    >U\\  RWR^R^ r*   r  c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
ClapTextAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r{   )r|   r}   r  rt   r  r   r  r  r  s     r(   r}   zClapTextAttention.__init__  s0    )&1	(0Er*   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y r  r   r"  s      r(   r%  zClapTextAttention.prune_heads  r&  r*   r"   r   r   r   rL   c                 p     | j                   |f|||d|}| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   r   r(  )	rt   r"   r   r   r   r  r)  r*  r
  s	            r(   r   zClapTextAttention.forward  s_     !tyy
)/	

 
  ;;|AF#%QR(88r*   r  r+  r   s   @r(   r  r    st    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	r*   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r{   )r|   r}   r   r   r  intermediate_sizer  r   r0  r1  r
   r2  r  s     r(   r}   zClapTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r*   r"   rL   c                 J    | j                  |      }| j                  |      }|S r{   r4  r5  s     r(   r   zClapTextIntermediate.forward  r6  r*   r  r   s   @r(   r  r    r7  r*   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r  )r|   r}   r   r   r!  r  r  r   rE  r   r;  r   r  s     r(   r}   zClapTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r*   r"   r  rL   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r{   r  r  s      r(   r   zClapTextOutput.forward  r  r*   r  r   s   @r(   r$  r$    r  r*   r$  c                        e Zd Z fdZ	 	 	 d	dej
                  deej                     deej                     dee   de	ej
                     f
dZ
d Z xZS )
ClapTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
r|   r}   rB  seq_len_dimr  rG  r  rJ  r$  r   r  s     r(   r}   zClapTextLayer.__init__  sI    '-'E'E$*6208$V,r*   r"   r   r   r   rL   c                      | j                   |f|||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }	|	f|z   }|S r  )rG  r   feed_forward_chunkrB  r*  )
rt   r"   r   r   r   r  self_attention_outputsr*  r
  rs  s
             r(   r   zClapTextLayer.forward  s     "0"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r*   c                 L    | j                  |      }| j                  ||      }|S r{   )rJ  r   )rt   r*  intermediate_outputrs  s       r(   r,  z ClapTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr*   r  )r\   r]   r^   r}   rA   r  r   r`   r  rb   r   r,  r   r   s   @r(   r(  r(    st    - 7;15,1|| !!2!23 E--.	
 $D> 
u||	2r*   r(  c                        e Zd Z fdZe	 	 	 	 	 d
dej                  deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )ClapTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r|   r}   r   r   rz  r{  num_hidden_layersr(  layerr  )rt   r   r  r   s      r(   r}   zClapTextEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#r"   r   r   r   r  r  rL   c           	          |rdnd }|rdnd }	t        | j                        D ]4  \  }
}|r||fz   }|||
   nd } |d||||d|}|d   }|s,|	|d   fz   }	6 |r||fz   }t        |||	      S )Nrc   )r"   r   r   r   r   r   )rZ   r"   r[   )r  r4  r   )rt   r"   r   r   r   r  r  r  r  r  r  r  r  rt  s                 r(   r   zClapTextEncoder.forward$  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO( +-)"3	
 M *!,M &9]1=M<O&O#!	P$   1]4D D++*
 	
r*   )NNFFT)r\   r]   r^   r}   r   rA   r  r   r`   r  r   rb   r   r   r   r   s   @r(   r1  r1    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r*   r1  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r{   )r|   r}   r   r   r  r  Tanhr  r  s     r(   r}   zClapTextPooler.__init__P  s9    YYv1163E3EF
'')r*   r"   rL   c                 \    |d d df   }| j                  |      }| j                  |      }|S rM  )r  r  )rt   r"   first_token_tensorpooled_outputs       r(   r   zClapTextPooler.forwardU  s6     +1a40

#566r*   r  r   s   @r(   r7  r7  O  s#    $
U\\ ell r*   r7  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)ClapPreTrainedModelr   clapFr  c                    | j                   j                  }t        |t              ri|j                  j
                  j                  j                  d|dz         |j                  j
                  j                  j                  d|dz         yt        |t              r|j                  j                  j                  t        j                  | j                   j                               |j                  j                  j                  t        j                  | j                   j                               yt        |t         j"                        r+|j
                  j                  j                  d|dz         yt        |t         j$                  t         j&                  f      rJ|j(                  j                  j+                          |j
                  j                  j                  d       yt        |t         j,                  t         j.                  f      r| j                   j0                  dz  d| j                   j2                  z  dz  z  |z  }t         j4                  j                  |j
                  |       |j(                  %|j(                  j                  j+                          yyt        |t6              r%|j8                  j                  j+                          yy)	zInitialize the weightsr   g{Gz?)meanstdg      ?r  r,   )rB  N)r   initializer_factorr   r  r  weightdatanormal_r  	ClapModellogit_scale_afill_r   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   zero_r   r   r  r3  initr   r   )rt   r  factorin_proj_stds       r(   _init_weightsz!ClapPreTrainedModel._init_weightsd  s   //f01&&--22::RV:W((//44<<#6TX=<Y	*  %%++DHHT[[5W5W,XY  %%++DHHT[[5W5W,XY-MM&&CVd]&Cr~~ >?KK""$MM$$S)BII 67;;22D8a$++B_B_>_dh=hilrrKGGOOFMM{O;{{&  &&( ' 67//44::< 8r*   N)
r\   r]   r^   r   ra   base_model_prefixsupports_gradient_checkpointingr   r  rQ  rc   r*   r(   r>  r>  ^  s$    &+#=BII =r*   r>  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     deej                     dee   dee   d	ee   deeef   fd
       Z xZS )ClapAudioModelr   r  c                 d    t         |   |       t        |      | _        | j	                          y r{   )r|   r}   r  audio_encoder	post_initr  s     r(   r}   zClapAudioModel.__init__  s'     -f5r*   rL   c                 B    | j                   j                  j                  S r{   )rW  r  r   rw   s    r(   get_input_embeddingsz#ClapAudioModel.get_input_embeddings  s    !!--222r*   r  r   r  r  c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      S )ae  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```r  r  r   r  r  )r   use_return_dictr   r  rW  )rt   r  r  r   r  r  s         r(   r   zClapAudioModel.forward  sx    > &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 !!)/!5# " 
 	
r*   NNNNN)r\   r]   r^   r   ra   main_input_namer}   r   r  rZ  r   r   rA   r`   
BoolTensorr  r   rb   r   r   r   r   s   @r(   rU  rU  |  s    &O 3bii 3  7;04,0/3&**
 !2!23*
 E,,-*
 $D>	*

 'tn*
 d^*
 
u00	1*
 *
r*   rU  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   b    e Zd ZU eed<   d fd	Zd Zd Zee		 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
e   de
e   de
e   deeej                     ef   fd              Z xZS )ClapTextModelr   c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r|   r}   r   r  r  r1  encoderr7  poolerrX  )rt   r   add_pooling_layerr   s      r(   r}   zClapTextModel.__init__  sM    
 	 ,V4&v.0AnV,t 	r*   c                 .    | j                   j                  S r{   r  r  rw   s    r(   rZ  z"ClapTextModel.get_input_embeddings  s    ...r*   c                 &    || j                   _        y r{   rh  rt   r   s     r(   set_input_embeddingsz"ClapTextModel.set_input_embeddings  s    */'r*   rE   r   r  r  r   r  r   r  r  rL   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j#                  || j                   j$                        }| j                  ||||      }| j'                  |||||d	      }|d
   }| j(                  | j)                  |      nd }t+        |||j,                  |j.                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer/   z5You have to specify either input_ids or inputs_embedsrN   r  r   )rE   r  r  r  T)r   r   r   r  r  r   r  )r   r   r  r]  r   %warn_if_padding_and_no_attention_maskr   rO   rA   onesr  r  r  r  r   rD   get_extended_attention_maskget_head_maskr3  rd  re  r   r"   r[   )rt   rE   r   r  r  r   r  r   r  r  r  r$   r  rO   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr<  s                        r(   r   zClapTextModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r*   )T	NNNNNNNNN)r\   r]   r^   r   ra   r}   rZ  rk  r   r   r   rA   r  r  r   rb   r   r   r   r   s   @r(   rb  rb    s     /0  -11515/3,004,0/3&*G
ELL)G
 !.G
 !.	G

 u||,G
 ELL)G
  -G
 $D>G
 'tnG
 d^G
 
uU\\"$PP	QG
  G
r*   rb  c                       e Zd ZU eed<   def fdZe	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd
       Ze	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd       Zee	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                      deej                     deej                     dee
   dee
   dee
   dee
   d	eeef   fd              Z xZS )rG  r   c                 .   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  t        j                  t        j                  |j                                    | _        t        j                  t        j                  t        j                  |j                                    | _        |j$                  | _        t'        |      | _        t+        |      | _        t/        |      | _        t+        |      | _        | j5                          y )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )r|   r}   r   text_configr   	TypeErrortypeaudio_configr   r   r   rA   rQ  r   rJ  rK  rH  rL  r  rb  
text_modelr  text_projectionrU  audio_modelaudio_projectionrX  )rt   r   ry  r|  r   s       r(   r}   zClapModel.__init__.  s=    &,,n=++,-Q0 
 &--?,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r*   rE   r   r  r   r  r  rL   c                 F   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      }||d   n|j
                  }| j                  |      }	t        j                  |	d      }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```rE   r   r  r   r  r  r   r/   r=   )	r   r   r  r]  r}  r  r~  F	normalize)
rt   rE   r   r  r   r  r  text_outputsr<  text_featuress
             r(   get_text_featureszClapModel.get_text_featuresN  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%/!5# ' 
 ,7+BQHbHb,,];Mr:r*   r  r  c                 @   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||      }|s|d   n|j
                  }| j                  |      }	t        j                  |	d      }	|	S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```)r  r  r  r   r/   r=   )	r   r   r  r]  r  r  r  r  r  )
rt   r  r  r   r   r  r  audio_outputsr<  audio_featuress
             r(   get_audio_featureszClapModel.get_audio_features~  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()# ) 
 1<a(A\A\..}=^<r*   return_lossc
           	      l   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  ||||d      }
| j                  |||||d      }|	s|
d   n|
j                  }| j                  |      }|	s|d   n|j                  }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }| j                  j                         }t        j                  ||j                               |z  }t        j                  ||j                               |z  }d}|r,t!        |      }t!        |j                               }||z   d	z  }t#        |||||||

      S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```NTr\  r  r   r,   r/   )r  r>   keepdimg       @)ri   rj   rk   rY   rf   rl   rm   )r   r   r  r]  r  r}  r  r  r~  r   rL  exprH  rA   r   trU   rh   )rt   rE   r  r  r   r  r  r   r  r  r  r  rf   rY   logit_scale_textlogit_scale_audiork   rj   ri   caption_loss
audio_losss                        r(   r   zClapModel.forward  s   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()/!5 ) 
 )%/!5 ' 
 0;}Q'@[@[,,\:-8l1ol>X>X**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  --113 ..224,,{LNN4DEHXX <<kmmoFIZZ+O<L)*:*<*<*>?J :-4D-+#%*,
 	
r*   NNNNNNru  )r\   r]   r^   r   ra   r}   r   r   rA   r  r  r`   r  r  r   
LongTensorr`  r   rb   rh   r   r   r   s   @r(   rG  rG  *  s@   z @  -115/3,0/3&*-ELL)- !.- u||,	-
 $D>- 'tn- d^- 
		- -^  26,015,0/3&*/ ./ ELL)/ !.	/
 $D>/ 'tn/ d^/ 
		/ /b  156:041537&*,0/3&*]
E,,-]
 !!2!23]
 E,,-	]

 !.]
 u//0]
 d^]
 $D>]
 'tn]
 d^]
 
uj 	!]
  ]
r*   rG  c                       e Zd ZU eed<   def fdZdej                  fdZd Z	e
e	 	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
ee   dee   deeef   fd              Z xZS )ClapTextModelWithProjectionr   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r{   )r|   r}   rb  r}  r  r~  rX  r  s     r(   r}   z$ClapTextModelWithProjection.__init__  s3     '/26:r*   rL   c                 B    | j                   j                  j                  S r{   r}  r  r  rw   s    r(   rZ  z0ClapTextModelWithProjection.get_input_embeddings  s    ))999r*   c                 :    || j                   j                  _        y r{   r  rj  s     r(   rk  z0ClapTextModelWithProjection.set_input_embeddings   s    5:""2r*   rE   r   r  r   r  r  c                    ||n| j                   j                  }| j                  |||||d      }|s|d   n|j                  }| j	                  |      }	t        |	|j                  |j                  |j                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```Tr  r   )rY   rZ   r"   r[   )	r   r]  r}  r  r~  rX   rZ   r"   r[   )
rt   rE   r   r  r   r  r  r  r<  rY   s
             r(   r   z#ClapTextModelWithProjection.forward#  s    2 &1%<k$++B]B])%/!5 ' 
 0;Q@Z@Z**=9"#*<<&44#..	
 	
r*   r  )r\   r]   r^   r   ra   r}   r   r  rZ  rk  r   r   r   rA   r  r  r   rb   rX   r   r   r   s   @r(   r  r    s    ~ :bii :;  -115/3,0/3&*+
ELL)+
 !.+
 u||,	+

 $D>+
 'tn+
 d^+
 
u))	*+
  +
r*   r  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	e
e	 	 	 	 	 ddeej                     deej                     dee   dee   d	ee   deeef   fd
              Z xZS )ClapAudioModelWithProjectionr   r  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r{   )r|   r}   rU  r  r  r  rX  r  s     r(   r}   z%ClapAudioModelWithProjection.__init__X  s4     )&1 3F ;r*   rL   c                 V    | j                   j                  j                  j                  S r{   )r  rW  r  r   rw   s    r(   rZ  z1ClapAudioModelWithProjection.get_input_embeddings_  s     --99>>>r*   r  r   r  r  c                 l   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||d      }|s|d   n|j
                  }| j                  |      }t        ||j                  |j                  |j                        S )av  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```Tr\  r   )rf   rZ   r[   r"   )r   r]  r   r  r  r  r  re   rZ   r[   r"   )	rt   r  r  r   r  r  r  r<  rf   s	            r(   r   z$ClapAudioModelWithProjection.forwardb  s    > &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 (()/!5 ) 
 1<a(A\A\,,];#%+==$//'55	
 	
r*   r^  )r\   r]   r^   r   ra   r_  r}   r   r  rZ  r   r   r   rA   r`   r`  r  r   rb   re   r   r   r   s   @r(   r  r  S  s    &O ?bii ?  7;04,0/3&*4
 !2!234
 E,,-4
 $D>	4

 'tn4
 d^4
 
u**	+4
  4
r*   r  )rG  r>  rb  r  rU  r  )r   )r   N)Ur_   r   r   dataclassesr   typingr   r   r   r   rA   torch.nn.functionalr   rR   r  activationsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   r   utilsr   r   r   r   r   configuration_clapr   r   r   
get_loggerr\   loggerr)   r9   r;   rJ   r  rU   rX   re   rh   r  ry   r   r   r   r  r  r-  r9  r>  rw  r  r  r  r  floatr  r  r  r  r  r$  r(  r1  r7  r>  rU  rb  rG  r  r  __all__rc   r*   r(   <module>r     s      ! 1 1     ! 9 
 G v v V V K K 
		H	%"*(4$7U\\ 7ell 7
 	?+ 	? 	? 
	?; 	? 	?  
  
   
H299 2%		 %P_")) _F\RYY \@
")) 
# #NBII  	bii 	zRYY z|9/ 9z3BII 3lB
ryy B
J")) &V= V=B (,%II%<<% 
% <<	%
 U\\*% % % %%87BII 7v *		 *\299  RYY %. %R.
bii .
dRYY  =/ = =:8
( 8
v b
' b
b
J d
# d
 d
N =
"5 =
 =
@ D
#6 D
 D
Nr*   