
    rh                        d dl mZmZmZ d dlZd dlZd dlmZ d dl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0  G d dejb                        Z2 G d dejb                        Z3dejh                  de5dejh                  fdZ6	 dDdejb                  dejh                  dejh                  dejh                  deejh                     d e7d!e7d"e*e,   fd#Z8d$ Z9dEd%Z: G d& d'ejb                        Z; G d( d)ejb                        Z< G d* d+e      Z= G d, d-e      Z>e- G d. d/e(             Z? G d0 d1e?      Z@e- G d2 d3e?             ZA	 	 dFd4eBe5e5f   d5e7d6e5deej                     d7e5dej                  fd8ZEe- G d9 d:e?             ZFd;ejh                  d<e5d=e5fd>ZG e-d?@       G dA dBe?e             ZHg dCZIy)G    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple   )MoonshineConfigc                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y Nsuper__init__configr	   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr)   
hidden_act	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/moonshine/modeling_moonshine.pyr(   zMoonshineEncoderMLP.__init__3   s^    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r%   )r/   r*   r0   )r2   r7   s     r5   forwardzMoonshineEncoderMLP.forward:   s4    /**=9/r6   __name__
__module____qualname__r(   torchTensorr:   __classcell__r4   s   @r5   r#   r#   2   s$    KU\\ ell r6   r#   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )N   r&   r1   s      r5   r(   zMoonshineDecoderMLP.__init__B   sc    #J/99V//1I1IA1MN99V55v7I7IJr6   r7   r8   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )NrF   dim)r/   chunkr*   r0   )r2   r7   gates      r5   r:   zMoonshineDecoderMLP.forwardI   sS    /+11!1<t**40=@/r6   r;   rB   s   @r5   rD   rD   A   s$    KU\\ ell r6   rD   r7   n_repr8   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)shapeexpandreshape)r7   rM   batchnum_key_value_headsslenhead_dims         r5   	repeat_kvrV   Q   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr6   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrF   r   rH   )rJ   dtype)ptrainingr    )rV   num_key_value_groupsr?   matmul	transposerO   r+   
functionalsoftmaxfloat32tora   r]   rc   
contiguous)rW   rX   rY   rZ   r[   r\   r]   r^   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r5   eager_attention_forwardrq   ]   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r6   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   NrF   r    rH   rI   r`   )r?   stackflatten)xx1x2s      r5   rotate_halfrx   w   sJ    	
319B	
319B;;Ryb)11"55r6   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t	        j
                  ||gd      }t	        j
                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrH   rF   rI   )	unsqueezerO   repeat_interleaverx   r?   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r5   apply_rotary_pos_embr   ~   sD   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr6   c                   l    e Zd ZdZdededededef
 fdZ	 	 	 	 	 ddej                  d	e
eej                  ej                  f      d
e
ej                     de
e   de
ej                     de
ej                     dee   deej                  e
ej                     e
eej                        f   fdZ xZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr)   	layer_idx	is_causalnum_attention_headsrS   c                 8   t         |           |j                  ||d       || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        || _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  | j                  z  |j                  d      | _        | j                  j*                  C| j                  j*                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _        y d| _        y )N)r   rS   rU   g      ࿩biasFr    r   )r'   r(   updater)   r   getattrr-   r   rU   rS   rd   r\   attention_dropoutr   r+   r,   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r2   r)   r   r   r   rS   target_multipletarget_head_dimr4   s	           r5   r(   zMoonshineAttention.__init__   s    	.AZmno"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r6   r7   position_embeddingsr[   past_key_valuecache_positionkey_value_statesr^   r8   c                 j   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t&        }| j                  j(                  dk7  rt*        | j                  j(                     }| j,                  xr |d u xr |	dkD  }| j.                  dkD  rt0        j2                  j4                  j7                  |
d| j.                  f      }
t0        j2                  j4                  j7                  |d| j.                  f      }t0        j2                  j4                  j7                  |d| j.                  f      } || |
|||f| j8                  sd	n| j:                  | j<                  |d
|\  }}| j.                  dkD  r|dd | j.                   f   }|j?                  ||	d      jA                         }| jC                  |      }||fS )NrH   r    rF   Tr   )r   r   r   eagerr           )r]   r\   r   .)"rO   r   viewr)   rS   rU   rf   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   rq   _attn_implementationr   r   r   r?   r+   rg   padrc   r   r\   rQ   rk   r   )r2   r7   r   r[   r   r   r   r^   bszq_lenquery_statesis_cross_attentionr   current_statesrl   rm   r   r   cache_kwargsattention_interfacer   rp   rn   s                          r5   r:   zMoonshineAttention.forward   s    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9%'2266t~~FJ!<@))$..9!/!E!E!/!D!D .>-I)}.Z'..t~~>CCJ)00@GGL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "n&@+9+@+@dnn?OQ_>`,(
L "*HC';L*VY[^'_$L*)'*3.Y+9+@+@dnnl,(
L )@;;++w6"9$++:Z:Z"[NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r6   )NNNNN)r<   r=   r>   __doc__r!   intboolr(   r?   r@   r   tupler
   
LongTensorr   r   r:   rA   rB   s   @r5   r   r      s   G#&#& #& 	#&
 !#& !#&P LP15*.5937U)||U) &eELL%,,,F&GHU) !.	U)
 !U) !!1!12U) #5<<0U) -.U) 
u||Xell3XeELL>Q5RR	SU)r6   r   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )MoonshineRotaryEmbeddingr)   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r'   r(   hasattr
isinstancer   dictr   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr)   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r2   r)   devicer   r4   s       r5   r(   z!MoonshineRotaryEmbedding.__init__)  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r6   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rH   r    mpscpuF)device_typeenabledrF   rI   ra   )r   floatrP   rO   rj   r   r   r   strr?   autocastrf   r|   r   r   r   ra   )
r2   ru   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r5   r:   z MoonshineRotaryEmbedding.forward:  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r%   )
r<   r=   r>   r!   r(   r?   no_gradr   r:   rA   rB   s   @r5   r   r   (  s3    / /" U]]_<  <r6   r   c                   (    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	eej                     d
eeej                  ej                  f      dee   deej                     fdZ xZS )MoonshineEncoderLayerr)   r   c                 d   t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||j                        | _	        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFr)   r   r   r   rS   r   )r'   r(   r-   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr#   encoder_hidden_actmlpr+   	LayerNorminput_layernormpost_attention_layernormr2   r)   r   r4   s      r5   r(   zMoonshineEncoderLayer.__init__K  s    !--+ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r6   r7   r[   r   r   	use_cacher   r   r^   r8   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )Nr7   r[   r   r   r   r   r    )r   r   r   r   )r2   r7   r[   r   r   r   r   r   r^   residual_s              r5   r:   zMoonshineEncoderLayer.forward[  s     !,,];)4>> 	
')%)) 3	
 	
q !=0 !55mD/ =0r6   )NNNFNN)r<   r=   r>   r!   r   r(   r?   r@   r   r   r
   r   r   r   r   r:   rA   rB   s   @r5   r   r   J  s    U U3 U& 2637*.$)59KO|| !. u//0	
 ! D> !!1!12 &eELL%,,,F&GH +, 
u||	r6   r   c                    
    e Zd Zddedee   f fdZ	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     d	eej                     d
ee
   dee   deej                     deeej                  ej                  f      deeej                  ej                  f      dee   deej                  eeej                  ej                  f      f   fdZ xZS )MoonshineDecoderLayerr)   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )r'   r(   r-   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrD   decoder_hidden_actr   r+   r   r   r   final_layernormr   s      r5   r(   zMoonshineDecoderLayer.__init__~  s    !--+ & B B & B B
 / & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr6   r7   r[   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   encoder_position_embeddingsr^   r8   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )Nr   )r7   r   r[   r   r   r   )r   r   r   r   r   r   )r2   r7   r[   r   r   r   r   r   r   r   r   r   r^   r   r   s                  r5   r:   zMoonshineDecoderLayer.forward  s     !,,];)4>> 	
')%)) 3	
 	
q !=0 ,$H 99-HM#00+!65-#  1  M1 %}4M ,,];/ =0r6   r%   )
NNNNNNFNNN)r<   r=   r>   r!   r   r   r(   r?   r@   r   r
   r   r   r   r   FloatTensorr:   rA   rB   s   @r5   r   r   }  si   L L8C= L6 268<9=37;?*.$)59KOSW.||. !..  (5	.
 !) 6. u//0. 'u'7'78. !. D>. !!1!12. &eELL%,,,F&GH. &.eELL%,,4N.O%P. +,. 
u  (51B1BEDUDU1U+V"WW	X.r6   r   c                   X    e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdej                  fdZy	)
MoonshinePreTrainedModelr)   modelinput_valuesTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   r       r   rF   )r   )r2   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r5    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r6   N)r<   r=   r>   r!   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr?   r   r  r   r6   r5   r   r     sH    $O&*#02IJN!#e>N>N #r6   r   c            
            e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Ze	 ddej                   d
eej$                     dee   defd       Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r  )
attentionsr7   r)   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t        |      | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t	        j$                  |d      | _        d| _        | j+                          y c c}w )Nr    r  r  F)kernel_sizestrider   rF   r  r   )r  r  gh㈵>)
num_groupsnum_channelsepsr)   r   )r'   r(   r)   r-   r+   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r2   r)   	embed_dimidxr4   s       r5   r(   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bcC"63/c
 ,,yu=&+#	 ds   D,r8   c                     | j                   S r%   r  r2   s    r5   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings  s    zzr6   rZ   c                     || _         y r%   r.  r2   rZ   s     r5   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings  s	    
r6   r[   r^   c                    |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|| j                  |j                  d         }d}|ddd|f   dd|f   }| j                  j                  dk(  r|d	k(  j                         r|nd}nF| j                  j                  d
k(  rt        ||j                         }nt#        ||j                         }t%        j&                  d|j                  d   |j(                        j                  d      }| j+                  ||      }| j,                  D ]  }	 |	|f|||d|} | j/                  |      }t1        |      S )a-  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r    r   rF   NrH     .flash_attention_2r   sdpar   )r[   r   r   )last_hidden_state)rz   r+   rg   tanhr  r#  gelur   r!  permuter  rO   r)   r   anyr   ra   r   r?   aranger   r$  r   r(  r   )
r2   r  r[   r^   r7   mask_lendownsample_strider   r   encoder_layers
             r5   r:   zMoonshineEncoder.forward  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3PVZ11V;!D^UbUhUh!i!;NML_L_!`||A}':':1'=mFZFZ[eefgh"oom\J![[ 	M)-)$7	
 M	 6&+
 	
r6   r%   )r<   r=   r>   r   r  r   r   _can_record_outputsr!   r(   r+   Moduler0  r3  r   r?   r   r   r@   r   r   r   r:   rA   rB   s   @r5   r  r    s     %O(.
 $bii "))   268
''8
 !.8
 +,	8

 
!8
 8
r6   r  c                   |    e Zd ZdZ eedd      e eedd      dZdef fdZ	e
	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     dee   deej                      dee   deej                     deej                      deej                     dee   deeef   fd       Z xZS )MoonshineDecoder	input_idsr    r   )index
layer_namer   )r  r7   cross_attentionsr)   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  d      | _        t!        |      | _        d| _        | j'                          y c c}w )NFr   r  )r'   r(   pad_token_idpadding_idx
vocab_sizer+   	Embeddingr-   embed_tokensr%  r&  decoder_num_hidden_layersr   r   r   normr   r$  r)  r*  )r2   r)   r,  r4   s      r5   r(   zMoonshineDecoder.__init__J  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bcC"63/c
 LL!3!3%@	2&A&+# 	 ds   Dr[   r   past_key_valuesinputs_embedsr   r   r   r   r^   r8   c
                    |du |duz  rt        d      || j                  |      }|r"| t               }t               }t        ||      }|F||j	                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }|	|j                  d   }d}|	d	dd|f   d	d|f   }	| j                  j                  d
k(  r|	dk(  j                         r|	nd}	nb| j                  j                  dk(  r%t        |	|j                   |j                  d         }	n$t#        |	|j                   |j                  d         }	| j$                  D ]  } ||||f|	|||||d|
} | j'                  |      }t)        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r    r8  )r)   input_embedsr[   r   rR  r   r`   r5  .r6  r   r7  )r   r   r   r   r   r   )r9  rR  )
ValueErrorrO  r   r   get_seq_lengthr?   r>  rO   r   rz   r   r)   r$  r   r=  r   ra   r   r   rQ  r   )r2   rF  r[   r   rR  rS  r   r   r   r   r^   r   r   past_seen_tokensro   r7   r   r?  r@  decoder_layers                       r5   r:   zMoonshineDecoder.forwardZ  sE   0 -t";<YZZ  --i8M0#/> $0N!12FH]^O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oom\J!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfCh)?nr&11V;)L*M,?,?ATATUWAX*& *D*M,?,?ATATUWAX*& "[[ 	M)% (>).#-$7 M	 		-08+/8O
 	
>B
 	
r6   )	NNNNNNNNN)r<   r=   r>   r  r   r   r   rB  r!   r(   r   r   r?   r   r@   r
   r   r   r   r   r   r   r   r:   rA   rB   s   @r5   rE  rE  A  sF   !O$%7q[Y.*+=QSab    151537+/59$(59=A9=Y
E,,-Y
 !.Y
 u//0	Y

 "%Y
   1 12Y
 D>Y
 !!1!12Y
  ((9(9:Y
 !) 6Y
 +,Y
 
u--	.Y
 Y
r6   rE  rO   	mask_probmask_length	min_masksc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r    z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr    r   )r   max)input_lengthnum_masked_spanepsilonr[  rZ  r\  sequence_lengths     r5   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr6   NrH   r   r   F)replace)rV  nprandomranditemdetachsumtolistr&  zerosr   choicer>  lenconcatenateonesint32appendarraybroadcast_torQ   r`  put_along_axis)rO   rZ  r[  r[   r\  
batch_sizere  r   r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanra  rb  spec_aug_mask_idxdummy_mask_idxoffsetsrc  rd  s    `` `            @@r5   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                       e Zd Zdef fdZd Zd Zd Zd Zd Z		 dde
j                  d	ee
j                     fd
Zee	 	 	 	 	 	 	 	 	 	 ddee
j                     d	ee
j                     dee
j                     dee
j                     deeee
j                           deeeee
j                     f      deee
j                        deee
j                        dee   dee
j                     dee   defd              Z xZS )MoonshineModelr)   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r%   )r'   r(   r  encoderrE  decoderr*  r2   r)   r4   s     r5   r(   zMoonshineModel.__init__0  s2     '/'/r6   c                 .    | j                   j                  S r%   r  rO  r/  s    r5   r0  z#MoonshineModel.get_input_embeddings8  s    ||(((r6   c                 &    || j                   _        y r%   r  r2  s     r5   r3  z#MoonshineModel.set_input_embeddings;  s    $)!r6   c                     | j                   S r%   )r  r/  s    r5   get_encoderzMoonshineModel.get_encoder>      ||r6   c                     | j                   S r%   )r  r/  s    r5   get_decoderzMoonshineModel.get_decoderA  r  r6   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)r  _freeze_parametersr/  s    r5   freeze_encoderzMoonshineModel.freeze_encoderD  s    
 	'')r6   input_featuresr[   c                 2   t        | j                  dd      s|S |j                         \  }}}| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }|dddf   j                  d|d      }d||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  | j                  j                        }t        j                  ||j                  t        j                        }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTr   )rZ  r[  r[   r\  )r   ra   NrH   )rZ  r[  r\  )r   r)   sizemask_time_probrc   r  mask_time_lengthmask_time_min_masksr?   tensorr   r   rP   mask_feature_probmask_feature_lengthmask_feature_min_masks)r2   r  r[   rx  r-   rd  mask_time_indicesmask_feature_indicess           r5   _mask_input_featuresz#MoonshineModel._mask_input_featuresK  s[    t{{$8$?!! 4B3F3F3H0
K;;%%)dmm 5_-++44 KK88-++99! !&->~G\G\didndn o 1!T' : A A"kSU V01N,-;;((1,#8[)++77 KK;;++<<	$  $)<<0D^MbMbjojtjt#u 34N/0r6   r  decoder_input_idsdecoder_attention_maskencoder_outputsrR  decoder_inputs_embedsdecoder_position_idsr   r   r^   r8   c                 B   | | j                   |fd|i|} | j                  d||||j                  ||||	|
d	|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )a	  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        r[   )	rF  r[   r   r   rR  rS  r   r   r   )r9  rR  decoder_hidden_statesdecoder_attentionsrI  encoder_last_hidden_stater   encoder_attentionsr   )r  r  r9  r   rR  r7   r  rI  )r2   r  r[   r  r  r  rR  r  r  r   r   r^   decoder_outputss                r5   r:   zMoonshineModel.forwardv  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r6   r%   )
NNNNNNNNNN)r<   r=   r>   r!   r(   r0  r3  r  r  r  r?   r   r   r   r  r   r   r   r   r   r   r   r   r   r:   rA   rB   s   @r5   r  r  .  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(59E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 "%e.?.?(@"ABE
 "%(;U5CTCT=U(U"VWE
  (e.?.?(@AE
 'uU-=-='>?E
 D>E
 !!1!12E
 +,E
 
E
  E
r6   r  rF  rK  decoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    NrH   r    r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrO   clonerV  masked_fill_)rF  rK  r  shifted_input_idss       r5   shift_tokens_rightr    s}     "++IOO<(CRC0668ae4adLMM""#4#<lKr6   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZdgZdef fdZd Zd Zd Zd Z	de
j                  fd	Zee	 	 	 	 	 	 	 	 	 	 	 dd
eej"                     deej$                     deej$                     deej$                     deeeej"                           deeeeej"                     f      deeej"                        deeej$                        dee   deej$                     deej$                     dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightr)   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r'   r(   r  r  r+   r,   r-   rM  proj_outr*  r  s     r5   r(   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r6   c                 6    | j                   j                         S r%   )r  r  r/  s    r5   r  z-MoonshineForConditionalGeneration.get_encoder      zz%%''r6   c                 6    | j                   j                         S r%   )r  r  r/  s    r5   r  z-MoonshineForConditionalGeneration.get_decoder  r  r6   c                     | j                   S r%   r  r/  s    r5   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r6   c                     || _         y r%   r  )r2   new_embeddingss     r5   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings  s	    &r6   r8   c                 6    | j                   j                         S r%   )r  r0  r/  s    r5   r0  z6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r6   r  r[   r  r  r  rR  r  r  r   r   labelsr^   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a/  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r[   r  r  r  rR  r  r  r   r   )logitsr  rM  )	lossr  rR  r  r  rI  r  r   r  )r  r)   rK  r  r  r  r9  loss_functionrM  r   rR  r  r  rI  r  r   r  )r2   r  r[   r  r  r  rR  r  r  r   r   r  r^   outputsr  r  s                   r5   r:   z)MoonshineForConditionalGeneration.forward  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r6   )NNNNNNNNNNN)r<   r=   r>   _tied_weights_keysr!   r(   r  r  r  r  r+   rC  r0  r   r   r   r?   r   r   r   r   r   r   r   r   r   r:   rA   rB   s   @r5   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(59-1T
u001T
 !!1!12T
 $E$4$45	T

 !))9)9 :T
 "%e.?.?(@"ABT
 "%(;U5CTCT=U(U"VWT
  (e.?.?(@AT
 'uU-=-='>?T
 D>T
 !!1!12T
 ))*T
 +,T
 
T
  T
r6   r  )r  r   r  )r   )Nr    )Nr   )Jtypingr   r   r   numpyrg  r?   torch.nnr+   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_moonshiner!   rC  r#   rD   r@   r   rV   r   rq   rx   r   r   r   r   r   r   r  rE  r   r   ndarrayr  r  r  r  __all__r   r6   r5   <module>r     sz  * - ,    I ! C C ) / g B 9  L F & I I 4")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%46'T}) })@<ryy <D06 0fG6 GT # # #._
/ _
D r
/ r
 r
r 26tc?tt t U--.	t
 t ZZtn N
- N
 N
b%,, c [^   
p
(@/ p

p
f ^r6   