
    rh                        d dl mZmZmZ d dlZd dlmZ d dlmZm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9  e.jt                  e;      Z< G d de      Z= G d dej|                        Z? G d dej|                        Z@ G d de0      ZA G d de1      ZB G d  d!e4      ZC G d" d#e      ZDe, G d$ d%e'             ZE G d& d'eE      ZF G d( d)e5      ZG G d* d+e8      ZH e,d,-       G d. d/eEe             ZIg d0ZJy)1    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightc                   j     e Zd ZdZdZdgZddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 fd	Z xZS )
MoonshineConfiga"  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesencoder_num_key_value_headsencoder_num_attention_headsencoder_num_hidden_layers)num_key_value_headsnum_attention_headsnum_hidden_layersc                    || _         || _        || _        || _        || _        || _        || _        ||}|| _        |	|}	|	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        t-        |        t/        | `  d||||d| y )N)bos_token_ideos_token_idis_encoder_decoderdecoder_start_token_id )
vocab_sizehidden_sizeintermediate_sizer0   decoder_num_hidden_layersr/   decoder_num_attention_headsr.   decoder_num_key_value_headspad_head_dim_to_multiple_ofencoder_hidden_actdecoder_hidden_actmax_position_embeddingsinitializer_ranger8   	use_cache
rope_thetarope_scalingpartial_rotary_factorr7   attention_biasattention_dropoutr   super__init__)selfr:   r;   r<   r0   r=   r/   r>   r.   r?   r@   rA   rB   rC   rD   r8   rE   rF   rG   rH   r7   rI   rJ   r5   r6   kwargs	__class__s                             /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/moonshine/modular_moonshine.pyrL   zMoonshineConfig.__init__   s    8 %&!2)B&)B&+F(+F(&.*E'+F(&.*E'+F(+F("4"4'>$!2&<#"$(%:""4,!2 	t$ 	
%%1#9		

 	
    )i   i   i     rR      rS   NNNgelusilui   g{Gz?   Tg     @Ng?TF        rV   r!   )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_maprL   __classcell__rO   s   @rP   r+   r+   1   s    {z J#4"5<<8M "#"#$%$%$($($(!! # !3D
 D
rQ   r+   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y NrK   rL   configr	   activation_fnnnLinearr;   r<   fc1fc2rM   rf   
hidden_actrO   s      rP   rL   zMoonshineEncoderMLP.__init__   s^    #J/99V//1I1IJ99V55v7I7IJrQ   hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S rd   )rj   rg   rk   )rM   rn   s     rP   forwardzMoonshineEncoderMLP.forward  s4    /**=9/rQ   rX   rY   rZ   rL   torchTensorrq   r_   r`   s   @rP   rb   rb      s$    KU\\ ell rQ   rb   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )Nr!   re   rl   s      rP   rL   zMoonshineDecoderMLP.__init__  sc    #J/99V//1I1IA1MN99V55v7I7IJrQ   rn   ro   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )Nr!   )dim)rj   chunkrg   rk   )rM   rn   gates      rP   rq   zMoonshineDecoderMLP.forward  sS    /+11!1<t**40=@/rQ   rr   r`   s   @rP   rv   rv     s$    KU\\ ell rQ   rv   c                   h    e Zd Zdededededef
 fdZ	 	 	 	 	 ddej                  de	e
ej                  ej                  f      d	e	ej                     d
e	e   de	ej                     de	ej                     dee   de
ej                  e	ej                     e	e
ej                        f   fdZ xZS )MoonshineAttentionrf   	layer_idx	is_causalr2   r1   c                 n   |j                  ||d       t        | 	  ||       || _        t	        |d|j
                  |j                  z        | _        | j                  j                  C| j                  j                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _
        y d| _
        y )N)r2   r1   head_dimrV   r   )updaterK   rL   r   getattrr;   r2   r   rf   r@   head_dim_padding)	rM   rf   r   r   r2   r1   target_multipletarget_head_dimrO   s	           rP   rL   zMoonshineAttention.__init__  s     	.AZmno+"
F4F4F&JdJd4de ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!rQ   rn   position_embeddingsattention_maskpast_key_valuecache_positionkey_value_statesrN   ro   c                 j   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t&        }| j                  j(                  dk7  rt*        | j                  j(                     }| j,                  xr |d u xr |	dkD  }| j.                  dkD  rt0        j2                  j4                  j7                  |
d| j.                  f      }
t0        j2                  j4                  j7                  |d| j.                  f      }t0        j2                  j4                  j7                  |d| j.                  f      } || |
|||f| j8                  sd	n| j:                  | j<                  |d
|\  }}| j.                  dkD  r|dd | j.                   f   }|j?                  ||	d      jA                         }| jC                  |      }||fS )Nry   rV   r!   Tr   )sincosr   eagerr   rW   )dropoutscalingr   .)"shapeq_projviewrf   r1   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r$   r'   _attn_implementationr   r   r   rs   rh   
functionalpadtrainingrJ   r   reshape
contiguouso_proj)rM   rn   r   r   r   r   r   rN   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statesr   r   cache_kwargsattention_interfacer   attn_outputattn_weightss                          rP   rq   zMoonshineAttention.forward3  s    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9%'2266t~~FJ!<@))$..9!/!E!E!/!D!D .>-I)}.Z'..t~~>CCJ)00@GGL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "n&@+9+@+@dnn?OQ_>`,(
L "*HC';L*VY[^'_$L*)'*3.Y+9+@+@dnnl,(
L )@;;++w6"9$++:Z:Z"[NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((rQ   )NNNNN)rX   rY   rZ   r+   intboolrL   rs   rt   r   tupler
   
LongTensorr   r   rq   r_   r`   s   @rP   r~   r~     s   && & 	&
 !& !&0 LP15*.5937U)||U) &eELL%,,,F&GHU) !.	U)
 !U) !!1!12U) #5<<0U) -.U) 
u||Xell3XeELL>Q5RR	SU)rQ   r~   c                       e Zd Zy)MoonshineRotaryEmbeddingN)rX   rY   rZ   r9   rQ   rP   r   r     s    rQ   r   c                   (     e Zd Zdedef fdZ xZS )MoonshineEncoderLayerrf   r   c                 F   t         |   ||       t        ||d|j                  |j                        | _        t        ||j                        | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFrf   r   r   r2   r1   bias)rK   rL   r~   r/   r.   	self_attnrb   rA   mlprh   	LayerNormr;   input_layernormpost_attention_layernormrM   rf   r   rO   s      rP   rL   zMoonshineEncoderLayer.__init__  s    ++ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%rQ   )rX   rY   rZ   r+   r   rL   r_   r`   s   @rP   r   r     s    U U3 U UrQ   r   c                    
    e Zd Zddedee   f fdZ	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     d	eej                     d
ee
   dee   deej                     deeej                  ej                  f      deeej                  ej                  f      dee   deej                  eeej                  ej                  f      f   fdZ xZS )MoonshineDecoderLayerrf   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )rK   rL   r;   r~   r>   r?   r   encoder_attnrv   rB   r   rh   r   r   r   final_layernormr   s      rP   rL   zMoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKrQ   rn   r   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsr   rE   r   r   encoder_position_embeddingsrN   ro   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )N)rn   r   r   r   rE   r   r   )rn   r   r   r   rE   r9   )r   r   r   r   r   r   )rM   rn   r   r   r   r   r   r   rE   r   r   r   rN   residual_s                  rP   rq   zMoonshineDecoderLayer.forward  s     !,,];)4>> 	
')%)) 3	
 	
q !=0 ,$H 99-HM#00+!65-#  1  M1 %}4M ,,];/ =0rQ   rd   )
NNNNNNFNNN)rX   rY   rZ   r+   r   r   rL   rs   rt   r   r
   r   r   r   r   FloatTensorrq   r_   r`   s   @rP   r   r     si   L L8C= L6 268<9=37;?*.$)59KOSW.||. !..  (5	.
 !) 6. u//0. 'u'7'78. !. D>. !!1!12. &eELL%,,,F&GH. &.eELL%,,4N.O%P. +,. 
u  (51B1BEDUDU1U+V"WW	X.rQ   r   c                   X    e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdej                  fdZy	)
MoonshinePreTrainedModelrf   modelinput_valuesTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   rV      r   r!   )r   )rM   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        rP    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""rQ   N)rX   rY   rZ   r+   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrs   r   r   r9   rQ   rP   r   r     sH    $O&*#02IJN!#e>N>N #rQ   r   c            
            e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Ze	 ddej                   d
eej$                     dee   defd       Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsrn   rf   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t        |      | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t	        j$                  |d      | _        d| _        | j+                          y c c}w )NrV   r   r   F)kernel_sizestrider   r!   r   r   )r   r   gh㈵>)
num_groupsnum_channelseps)rf   r   )rK   rL   rf   r;   rh   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListranger0   r   r   r   
layer_normgradient_checkpointing	post_init)rM   rf   	embed_dimidxrO   s       rP   rL   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bcC"63/c
 ,,yu=&+#	 ds   D,ro   c                     | j                   S rd   r   rM   s    rP   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings"  s    zzrQ   valuec                     || _         y rd   r   )rM   r  s     rP   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings%  s	    
rQ   r   rN   c                    |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|| j                  |j                  d         }d}|ddd|f   dd|f   }| j                  j                  dk(  r|d	k(  j                         r|nd}nF| j                  j                  d
k(  rt        ||j                         }nt#        ||j                         }t%        j&                  d|j                  d   |j(                        j                  d      }| j+                  ||      }| j,                  D ]  }	 |	|f|||d|} | j/                  |      }t1        |      S )a-  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        rV   r   r!   Nry     .flash_attention_2rW   sdpadevice)r   r   r   )last_hidden_state)	unsqueezerh   r   tanhr   r   rT   r   r   permuter   r   rf   r   anyr   dtyper   rs   aranger
  r   r   r   r   )
rM   r   r   rN   rn   mask_lendownsample_strider   r   encoder_layers
             rP   rq   zMoonshineEncoder.forward(  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3PVZ11V;!D^UbUhUh!i!;NML_L_!`||A}':':1'=mFZFZ[eefgh"oom\J![[ 	M)-)$7	
 M	 6&+
 	
rQ   rd   )rX   rY   rZ   r[   r   r~   r   _can_record_outputsr+   rL   rh   Moduler  r  r   rs   r   r   rt   r   r   r   rq   r_   r`   s   @rP   r   r     s     %O(.
 $bii "))   268
''8
 !.8
 +,	8

 
!8
 8
rQ   r   c                   |    e Zd ZdZ eedd      e eedd      dZdef fdZ	e
	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     dee   deej                      dee   deej                     deej                      deej                     dee   deeef   fd       Z xZS )MoonshineDecoder	input_idsrV   r   )index
layer_namer   )r   rn   cross_attentionsrf   c           	         t         |   |       t        j                  |j                  d      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        y c c}w NFr   )rK   rL   rh   r   r;   normr   r   r=   r   r   )rM   rf   r   rO   s      rP   rL   zMoonshineDecoder.__init__l  s\     LL!3!3%@	mm;@AaAa;bcC"63/c
cs   A=r   r   r-   inputs_embedsrE   r   r   r   rN   ro   c
                    |du |duz  rt        d      || j                  |      }|r"| t               }t               }t        ||      }|F||j	                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }|	|j                  d   }d}|	d	dd|f   d	d|f   }	| j                  j                  d
k(  r|	dk(  j                         r|	nd}	nb| j                  j                  dk(  r%t        |	|j                   |j                  d         }	n$t#        |	|j                   |j                  d         }	| j$                  D ]  } ||||f|	|||||d|
} | j'                  |      }t)        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   rV   r	  )rf   input_embedsr   r   r-   r   r  .r  rW   r  )r   r   r   rE   r   r   )r  r-   )
ValueErrorembed_tokensr   r   get_seq_lengthrs   r  r   r
  r  r   rf   r   r   r  r   r  r   r   r  r   )rM   r  r   r   r-   r   rE   r   r   r   rN   r   r   past_seen_tokenscausal_maskrn   r   r  r  decoder_layers                       rP   rq   zMoonshineDecoder.forwards  sE   0 -t";<YZZ  --i8M0#/> $0N!12FH]^O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oom\J!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfCh)?nr&11V;)L*M,?,?ATATUWAX*& *D*M,?,?ATATUWAX*& "[[ 	M)% (>).#-$7 M	 		-08+/8O
 	
>B
 	
rQ   )	NNNNNNNNN)rX   rY   rZ   r   r   r~   r   r  r+   rL   r   r   rs   r   rt   r
   r   r   r   r   r   r   r   rq   r_   r`   s   @rP   r  r  d  sF   !O$%7q[Y.*+=QSab
 
  151537+/59$(59=A9=Y
E,,-Y
 !.Y
 u//0	Y

 "%Y
   1 12Y
 D>Y
 !!1!12Y
  ((9(9:Y
 !) 6Y
 +,Y
 
u--	.Y
 Y
rQ   r  c                      e Zd Zee	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     dee	e	ej                           dee
ee	ej                     f      dee	ej                        d	ee	ej                        d
ee   deej                     dee   defd              Zy)MoonshineModelNr   r   decoder_input_idsdecoder_attention_maskencoder_outputsr-   decoder_inputs_embedsdecoder_position_idsrE   r   rN   ro   c                 B   | | j                   |fd|i|} | j                  d||||j                  ||||	|
d	|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )a	  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        r   )	r  r   r   r   r-   r   r   rE   r   )r  r-   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsr9   )encoderdecoderr  r   r-   rn   r   r  )rM   r   r   r,  r-  r.  r-   r/  r0  rE   r   rN   decoder_outputss                rP   rq   zMoonshineModel.forward  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
rQ   )
NNNNNNNNNN)rX   rY   rZ   r   r   r   rs   r   r   r   r   r   r   r   r   r   rq   r9   rQ   rP   r+  r+    sQ    59598<=AEIZ^DHBF$(59E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 "%e.?.?(@"ABE
 "%(;U5CTCT=U(U"VWE
  (e.?.?(@AE
 'uU-=-='>?E
 D>E
 !!1!12E
 +,E
 
E
  E
rQ   r+  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZdgZdef fdZd Zd Zd Zd Z	de
j                  fd	Zee	 	 	 	 	 	 	 	 	 	 	 dd
eej"                     deej$                     deej$                     deej$                     deeeej"                           deeeeej"                     f      deeej"                        deeej$                        dee   deej$                     deej$                     dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightrf   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y r  )
rK   rL   r+  r   rh   ri   r;   r:   proj_outr   )rM   rf   rO   s     rP   rL   z*MoonshineForConditionalGeneration.__init__#  sH     #F+
		&"4"4f6G6GeT 	rQ   c                 6    | j                   j                         S rd   )r   get_encoderr   s    rP   r?  z-MoonshineForConditionalGeneration.get_encoder+      zz%%''rQ   c                 6    | j                   j                         S rd   )r   get_decoderr   s    rP   rB  z-MoonshineForConditionalGeneration.get_decoder.  r@  rQ   c                     | j                   S rd   r=  r   s    rP   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings1  s    }}rQ   c                     || _         y rd   rD  )rM   new_embeddingss     rP   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings4  s	    &rQ   ro   c                 6    | j                   j                         S rd   )r   r  r   s    rP   r  z6MoonshineForConditionalGeneration.get_input_embeddings7  s    zz..00rQ   r   r   r,  r-  r.  r-   r/  r0  rE   r   labelsrN   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a/  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r   r,  r.  r-  r-   r/  r0  rE   r   )logitsrJ  r:   )	lossrL  r-   r2  r3  r  r4  r   r5  )r)   rf   pad_token_idr8   r   r=  r  loss_functionr:   r   r-   r2  r3  r  r4  r   r5  )rM   r   r   r,  r-  r.  r-   r/  r0  rE   r   rJ  rN   outputsrL  rM  s                   rP   rq   z)MoonshineForConditionalGeneration.forward:  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
rQ   )NNNNNNNNNNN)rX   rY   rZ   _tied_weights_keysr+   rL   r?  rB  rE  rH  rh   r  r  r   r   r   rs   r   r   r   r   r   r   r   r   r   rq   r_   r`   s   @rP   r;  r;    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(59-1T
u001T
 !!1!12T
 $E$4$45	T

 !))9)9 :T
 "%e.?.?(@"ABT
 "%(;U5CTCT=U(U"VWT
  (e.?.?(@AT
 'uU-=-='>?T
 D>T
 !!1!12T
 ))*T
 +,T
 
T
  T
rQ   r;  )r+   r+  r   r;  )Ktypingr   r   r   rs   torch.nnrh   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   configuration_utilsr   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    glm.modeling_glmr"   r#   r$   llama.modeling_llamar%   r&   r'   whisper.modeling_whisperr(   r)   
get_loggerrX   loggerr+   r  rb   rv   r~   r   r   r   r   r   r  r+  r;  __all__r9   rQ   rP   <module>rh     sV   - ,   I ! C C 3 ) / g B 9  : F & R R U U Y Y G 
		H	%J
& J
Z")) "))  k) k)\	1 	U- U"G6 GT # # #._
/ _
Di
z i
XH
\ H
V 
p
(@/ p

p
frQ   