
import math
from collections.abc import Callable, Sequence
from typing import Any, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, SlidingWindowLayer
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from ..gemma2.configuration_gemma2 import Gemma2Config
from ..gemma2.modeling_gemma2 import (
    Gemma2MLP,
    Gemma2PreTrainedModel,
    Gemma2RotaryEmbedding,
    eager_attention_forward,
    rotate_half,
)
from ..gemma3.modeling_gemma3 import (
    Gemma3Attention,
    Gemma3DecoderLayer,
    Gemma3ForCausalLM,
    Gemma3RMSNorm,
    Gemma3TextModel,
    Gemma3TextScaledWordEmbedding,
)
from ..paligemma.modeling_paligemma import (
    PaliGemmaCausalLMOutputWithPast,
    PaliGemmaForConditionalGeneration,
    PaliGemmaModel,
    PaligemmaModelOutputWithPast,
)
from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig


logger = logging.get_logger(__name__)


class Gemma3nTextConfig(Gemma2Config, PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an
    Gemma3nTextModel model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.
    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).

    Configuration objects inherit from [`Gemma3nTextConfig`] and can be used to control the model outputs. Read
    the documentation from [`Gemma3nTextConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262400):
            Vocabulary size of the Gemma3nText model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Gemma3nTextModel`]
        vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
            Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
            Dimension of the hidden representations for per-layer embeddings.
        intermediate_size (`int` or `Sequence[int]`, *optional*, defaults to 16384):
            Dimension of the MLP representations. MatFormer configurations may wish to provide a sequence of integers
            to account for variable intermediate_size values across layers. In such cases,
            `len(intermediate_size) == num_hidden_layers`.
        num_hidden_layers (`int`, *optional*, defaults to 35):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout this
            [paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to
            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
            activation function.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention.
            NOTE: if you apply a new rope type and you expect the model to work on longer `max_position_embeddings`, we
            recommend updating this value accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 512):
            This is the size of the sliding window used by local attention layers.
        layer_types (`Optional`, *optional*):
            A sequence of strings defining the attention type for that layer as either "sliding_attention" or
            "full_attention". If not provided, `layer_types` will be inferred from `num_hidden_layers` using a pattern
            of four "sliding_attention" layers followed by one "full_attention". The last layer in the model should always
            be a "full_attention" layer.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
            Scaling factor when applying tanh softcapping on the logits.
        altup_active_idx (`int`, *optional*, defaults to 0):
            The index of the prediction from which AltUp will compute additional predictions and to which it will
            apply corrections.
        altup_coef_clip (`float`, *optional*, defaults to 120.0):
            The maximum amplitude of an AltUp prediction or correction coefficient weight.
        altup_correct_scale (`bool`, *optional*, defaults to `True`):
            If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
        altup_num_inputs (`int`, *optional*, defaults to 4):
            The number of predictions that AltUp should make given the input sequence.
        num_kv_shared_layers (`int`, *optional*, defaults to 15):
            The number of layers that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
            layers in the model "share" the KV values in that each local and global layer in this range uses the KV
            cache values computed for the last local or global layer, respectively, before entering this range. The
            value of `num_kv_shared_layers` should be a multiple of the attention layer pattern defined by `layer_types`.
        laurel_rank (int, *optional*, defaults to 64):
            The intermediate size for the linear projections in the Learned Augmented Residual Layer.
        activation_sparsity_pattern (Sequence[float], *optional*, defaults to `(0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)`):
            The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
            explicitly provide a sparsity value for each layer in the model.

    ```python
    >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig

    >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
    >>> configuration = Gemma3nTextConfig()

    >>> # Initializing a model from the gemma3n_text-E4B style configuration
    >>> model = Gemma3nTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
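
    A MatFormer-style configuration can give every layer its own MLP width by passing a sequence for
    `intermediate_size`; a minimal sketch (the per-layer widths below are hypothetical, not release values),
    where the sequence length must equal `num_hidden_layers`:

    ```python
    >>> sizes = [8192] * 10 + [16384] * 25  # hypothetical per-layer MLP widths
    >>> configuration = Gemma3nTextConfig(num_hidden_layers=35, intermediate_size=sizes)
    >>> len(configuration.intermediate_size) == configuration.num_hidden_layers
    True
    ```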
    """

    model_type = "gemma3n_text"

    def __init__(
        self,
        vocab_size: int = 262_400,
        vocab_size_per_layer_input: int = 262_144,
        hidden_size: int = 2048,
        hidden_size_per_layer_input: int = 256,
        intermediate_size: Union[int, Sequence[int]] = 16_384,
        num_hidden_layers: int = 35,
        num_attention_heads: int = 8,
        num_key_value_heads: int = 2,
        head_dim: int = 256,
        hidden_activation: str = "gelu_pytorch_tanh",
        max_position_embeddings: int = 32_768,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-6,
        use_cache: bool = True,
        pad_token_id: int = 0,
        eos_token_id: int = 1,
        bos_token_id: int = 2,
        rope_theta: float = 1_000_000.0,
        rope_scaling: Optional[dict[str, Any]] = None,
        rope_local_base_freq: float = 10_000.0,
        attention_bias: bool = False,
        attention_dropout: float = 0.0,
        sliding_window: int = 512,
        layer_types: Optional[Sequence[str]] = None,
        final_logit_softcapping: float = 30.0,
        altup_active_idx: int = 0,
        altup_coef_clip: float = 120.0,
        altup_correct_scale: bool = True,
        altup_num_inputs: int = 4,
        num_kv_shared_layers: int = 15,
        laurel_rank: int = 64,
        activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = (0.95,) * 10 + (0.0,) * 25,
        **kwargs,
    ):
        PretrainedConfig.__init__(
            self,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            **kwargs,
        )

        if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers:
            raise ValueError(
                "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
                f"Expected {num_hidden_layers} values but got {intsize_len}."
            )
        elif not isinstance(intermediate_size, Sequence):
            intermediate_size = [intermediate_size] * num_hidden_layers

        self.vocab_size = vocab_size
        self.vocab_size_per_layer_input = vocab_size_per_layer_input
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.rope_local_base_freq = rope_local_base_freq
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.rope_scaling = rope_scaling
        rope_config_validation(self)

        if layer_types is None:
            # Default pattern: four sliding-window layers followed by one full-attention layer.
            self.layer_types = [
                "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
            ]
        else:
            self.layer_types = layer_types

        layer_type_validation(self.layer_types)

        self.hidden_size_per_layer_input = hidden_size_per_layer_input
        self.num_kv_shared_layers = num_kv_shared_layers
        self.altup_active_idx = altup_active_idx
        self.altup_coef_clip = altup_coef_clip
        self.altup_correct_scale = altup_correct_scale
        self.altup_num_inputs = altup_num_inputs
        self.laurel_rank = laurel_rank

        if activation_sparsity_pattern is None:
            activation_sparsity_pattern = [0.0] * num_hidden_layers

        if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers:
            raise ValueError(
                "activation_sparsity_pattern must have an explicit activation sparsity value for every layer."
                f"Expected {num_hidden_layers} values but got {len_asp}."
            )
        self.activation_sparsity_pattern = activation_sparsity_pattern


class Gemma3nAudioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate
    an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.,
    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).

    Configuration objects inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read
    the documentation from [`Gemma3nAudioConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 128):
            Vocabulary size of the additional hard-token embeddings for audio model. These augment the embeddings
            included in the `Gemma3nTextModel` to provide, e.g., the end of audio and audio soft token placeholder
            tokens when converting `input_ids` to embeddings in the `Gemma3nForConditionalGeneration` model.
        vocab_offset (`int`, *optional*, defaults to 262272):
            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
        input_feat_size (`int`, *optional*, defaults to 128):
            The number of channels in each mel-spectrogram frame.
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimension of the hidden representations.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
            Clipping value used to stabilize extremely large gradient values.
        conf_attention_chunk_size (`int`, *optional*, defaults to 12):
            The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_attention_context_left (`int`, *optional*, defaults to 13):
            The left context size of the local attention inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_attention_context_right (`int`, *optional*, defaults to 0):
            The right context size of the local attention inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
            Logit cap applied during local attention inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_num_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads in local attention inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_num_hidden_layers (`int`, *optional*, defaults to 12):
            The number of layers that use local attention inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_conv_kernel_size (`int`, *optional*, defaults to 5):
            Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_reduction_factor (`int`, *optional*, defaults to 4):
            Reduction factor used in the conformer block inside the Conformer ("conf") section of the
            Universal Speech Model.
        conf_residual_weight (`float`, *optional*, defaults to 0.5):
            Residual connection weight inside the Conformer ("conf") section of the
            Universal Speech Model.
        sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
            The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
            ("sscp") section of the Universal Speech Model.
        sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
            Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
            Projection ("sscp") section of the Universal Speech Model.
        sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
            Kernel sizes of the two convolutional layers in the subsample convolution projection  in the Sub-sample
            Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
            corresponds to the frequency dimension.
        sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
            Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
            Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
            corresponds to the frequency dimension.

    Example:

    ```python
    >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder

    >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
    >>> configuration = Gemma3nAudioConfig()

    >>> # Initializing a model from the gemma3n_audio-E4B style configuration
    >>> model = Gemma3nAudioEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
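
    The local attention span of each Conformer block follows directly from the three `conf_attention_*`
    sizes above, mirroring how `Gemma3nAudioAttention` derives its per-chunk context; a small sketch with
    the default values:

    ```python
    >>> configuration = Gemma3nAudioConfig()
    >>> # past context (context_left - 1 frames) + 12-frame query chunk + future context
    >>> max(0, configuration.conf_attention_context_left - 1) + configuration.conf_attention_chunk_size + configuration.conf_attention_context_right
    24
    ```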
    """

    model_type = "gemma3n_audio"

    def __init__(
        self,
        vocab_size: int = 128,
        vocab_offset: int = 262_272,
        input_feat_size: int = 128,
        hidden_size: int = 1536,
        rms_norm_eps: float = 1e-6,
        gradient_clipping: float = 10_000_000_000.0,
        conf_attention_chunk_size: int = 12,
        conf_attention_context_left: int = 13,
        conf_attention_context_right: int = 0,
        conf_attention_logit_cap: float = 50.0,
        conf_num_attention_heads: int = 8,
        conf_num_hidden_layers: int = 12,
        conf_conv_kernel_size: int = 5,
        conf_reduction_factor: int = 4,
        conf_residual_weight: float = 0.5,
        sscp_conv_channel_size: tuple[int, int] = (128, 32),
        sscp_conv_group_norm_eps: float = 1e-3,
        sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ((3, 3), (3, 3)),
        sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ((2, 2), (2, 2)),
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.input_feat_size = input_feat_size
        self.hidden_size = hidden_size
        self.rms_norm_eps = rms_norm_eps
        self.vocab_size = vocab_size
        self.vocab_offset = vocab_offset
        self.gradient_clipping = gradient_clipping
        self.conf_attention_chunk_size = conf_attention_chunk_size
        self.conf_attention_context_left = conf_attention_context_left
        self.conf_attention_context_right = conf_attention_context_right
        self.conf_attention_logit_cap = conf_attention_logit_cap
        self.conf_num_attention_heads = conf_num_attention_heads
        self.conf_num_hidden_layers = conf_num_hidden_layers
        self.conf_conv_kernel_size = conf_conv_kernel_size
        self.conf_reduction_factor = conf_reduction_factor
        self.conf_residual_weight = conf_residual_weight
        self.sscp_conv_channel_size = sscp_conv_channel_size
        self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps
        self.sscp_conv_kernel_size = sscp_conv_kernel_size
        self.sscp_conv_stride_size = sscp_conv_stride_size


class Gemma3nVisionConfig(TimmWrapperConfig):
    r"""
    This is the configuration class to store the configuration for a timm backbone [`TimmWrapper`]. It is used to
    instantiate an timm model model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B
    vision tower, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).

    Configuration objects inherit from [`Gemma3nVisionConfig`] and can be used to control the model outputs. Read the
    documentation from [`Gemma3nVisionConfig`] for more information.

    Config loads imagenet label descriptions and stores them in `id2label` attribute, `label2id` attribute for default
    imagenet models is set to `None` due to collisions in the label descriptions.

    Args:
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        do_pooling (`bool`, *optional*, defaults to `False`):
            Whether to do pooling for the last_hidden_state in `TimmWrapper` or not.
        architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`):
            Determines vision architecture for TimmWrapper.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        vocab_size (`int`, *optional*, defaults to 128):
            Vocabulary size of the additional hard-token embeddings for vision model.
        vocab_offset (`int`, *optional*, defaults to 262144):
            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.

    Example:
    ```python
    >>> from transformers import Gemma3nVisionConfig, TimmWrapper

    >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
    >>> configuration = Gemma3nVisionConfig()

    >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
    >>> model = TimmWrapper(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
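
    The `vocab_offset` maps tokenizer ids onto rows of the 128-entry multimodal embedding table; a short
    sketch of the relationship (the token id below is hypothetical):

    ```python
    >>> configuration = Gemma3nVisionConfig()
    >>> token_id = 262_200  # a hypothetical vision token id
    >>> token_id - configuration.vocab_offset  # row in the embedding table
    56
    ```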
    """

    model_type = "gemma3n_vision"

    def __init__(
        self,
        initializer_range: float = 0.02,
        do_pooling: bool = False,
        architecture: str = "mobilenetv5_300m_enc",
        hidden_size: int = 2048,
        vocab_size: int = 128,
        vocab_offset: int = 262_144,
        rms_norm_eps: float = 1e-6,
        model_args: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.architecture = architecture
        self.initializer_range = initializer_range
        self.do_pooling = do_pooling
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.vocab_offset = vocab_offset
        self.rms_norm_eps = rms_norm_eps
        self.model_args = model_args  # passed through to timm when instantiating the backbone


class Gemma3nConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma3nForConditionalGeneration`]. It is used to
    instantiate a Gemma3nForConditionalGeneration according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    Gemma3n-E4B.

    e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3nTextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict.
        audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
            The number of soft tokens per audio clip.
        vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of soft tokens per image.
        boi_token_id (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_id (`int`, *optional*, defaults to 262144):
            The end-of-image token index to wrap the image prompt.
        image_token_id (`int`, *optional*, defaults to 262145):
            The image token index to encode the image prompt.
        boa_token_id (`int`, *optional*, defaults to 256000):
            The begin-of-audio token index to wrap the audio prompt.
        eoa_token_id (`int`, *optional*, defaults to 262272):
            The end-of-audio token index to wrap the audio prompt.
        audio_token_id (`int`, *optional*, defaults to 262273):
            The audio token index to encode the audio prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig

    >>> # Initializing a MobileNet vision config, which is loaded from TIMM
    >>> vision_config = Gemma3nVisionConfig()

    >>> # Initializing a Gemma3n Audio config
    >>> audio_config = Gemma3nAudioConfig()

    >>> # Initializing a Gemma3n Text config
    >>> text_config = Gemma3nTextConfig()

    >>> # Initializing a Gemma3n gemma-3-4b style configuration
    >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)

    >>> # Initializing a model from the gemma-3n-E4B style configuration
    >>> model = Gemma3nForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
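
    >>> # The default special token ids that bracket an image span (shown for illustration)
    >>> configuration.boi_token_id, configuration.image_token_id, configuration.eoi_token_id
    (255999, 262145, 262144)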
    ```
    """

    model_type = "gemma3n"
    sub_configs = {
        "text_config": Gemma3nTextConfig,
        "vision_config": Gemma3nVisionConfig,
        "audio_config": Gemma3nAudioConfig,
    }

    def __init__(
        self,
        text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None,
        vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None,
        audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None,
        audio_soft_tokens_per_image: int = 188,
        vision_soft_tokens_per_image: int = 256,
        boi_token_id: int = 255_999,
        eoi_token_id: int = 262_144,
        image_token_id: int = 262_145,
        boa_token_id: int = 256_000,
        eoa_token_id: int = 262_272,
        audio_token_id: int = 262_273,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if isinstance(text_config, dict):
            text_config = Gemma3nTextConfig(**text_config)
        elif text_config is None:
            text_config = Gemma3nTextConfig()
            logger.info("text_config is None. Using default Gemma3nTextConfig.")

        if isinstance(vision_config, dict):
            vision_config = Gemma3nVisionConfig(**vision_config)
        elif vision_config is None:
            vision_config = Gemma3nVisionConfig()
            logger.info("vision_config is None. Using default Gemma3nVisionConfig.")

        if isinstance(audio_config, dict):
            audio_config = Gemma3nAudioConfig(**audio_config)
        elif audio_config is None:
            audio_config = Gemma3nAudioConfig()
            logger.info("audio_config is None. Using default Gemma3nAudioConfig.")

        self.text_config = text_config
        self.vision_config = vision_config
        self.audio_config = audio_config

        self.audio_soft_tokens_per_image = audio_soft_tokens_per_image
        self.vision_soft_tokens_per_image = vision_soft_tokens_per_image
        self.boi_token_id = boi_token_id
        self.eoi_token_id = eoi_token_id
        self.image_token_id = image_token_id
        self.boa_token_id = boa_token_id
        self.eoa_token_id = eoa_token_id
        self.audio_token_id = audio_token_id
        self.initializer_range = initializer_range


class Gemma3nModelOutputWithPast(PaligemmaModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    """

    audio_hidden_states: Optional[torch.FloatTensor] = None


class Gemma3nCausalLMOutputWithPast(PaliGemmaCausalLMOutputWithPast):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    """

    audio_hidden_states: Optional[torch.FloatTensor] = None


class Gemma3nRMSNorm(Gemma3RMSNorm):
    def __init__(self, dim: int, eps: float = 1e-6, with_scale: bool = True):
        super().__init__(dim, eps=eps)
        del self.weight
        self.with_scale = with_scale

        if self.with_scale:
            self.weight = nn.Parameter(torch.ones(dim))
        else:
            self.register_buffer("weight", torch.tensor(1.0), persistent=False)

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize and scale in float32, then cast back to the input dtype.
        output = self._norm(x.float()) * self.weight.float()
        return output.type_as(x)


class Gemma3nAudioRelativePositionEmbedding(nn.Module):
    def __init__(self, config: Gemma3nAudioConfig):
        super().__init__()
        self.config = config

        self.num_heads = self.config.conf_num_attention_heads
        self.channels = self.config.hidden_size
        self.head_dim = self.channels // self.num_heads
        self.max_backward = max(0, self.config.conf_attention_context_left - 1)
        self.max_forward = self.config.conf_attention_context_right

        self.pos_proj = nn.Linear(self.channels, self.num_heads * self.head_dim, bias=False)

        min_timescale = 1.0
        max_timescale = 1.0e4
        num_timescales = self.channels // 2
        log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
        inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
        self.register_buffer(
            "inv_timescales",
            inv_timescales.float().unsqueeze(0).unsqueeze(0),
            persistent=False,
        )

    def _get_timing_signal_1d_pos(self, position: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
        position = position.float().unsqueeze(-1)
        scaled_time = position * self.inv_timescales.to(device=position.device, dtype=torch.float32)
        timing_signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1)
        return timing_signal.type(dtype)

    def _relative_shift(
        self,
        term_bd_before_shift: torch.Tensor,
        batch_size: int,
        num_heads: int,
        num_query_blocks: int,
        query_block_size: int,
        key_context_size: int,
        max_span_plus_1: int,
    ) -> torch.Tensor:
        """Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        """
        # Padding the relative-logit axis to C+1 and flattening the last two axes
        # slides row w of every block right by w positions; slicing and reshaping
        # then realigns relative offsets with absolute key positions.
        pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1

        padding_tuple = (0, pad_amount_last_dim)
        term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple)
        term_bd_reshaped = term_bd_padded.reshape(
            (
                batch_size,
                num_heads,
                num_query_blocks,
                query_block_size * (key_context_size + 1),
            )
        )
        term_bd_sliced = term_bd_reshaped[:, :, :, : query_block_size * key_context_size]
        term_bd_shifted = term_bd_sliced.reshape(
            (
                batch_size,
                num_heads,
                num_query_blocks,
                query_block_size,
                key_context_size,
            )
        )
        return term_bd_shifted

    def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor:
        # queries: [B, U, W, N, H]; keys: [B, U, C, N, H]
        batch_size, num_query_blocks, query_block_size, num_heads, head_dim = queries.shape
        _, _, key_context_size, _, _ = keys.shape

        # Relative positions from max_backward (past) down to -max_forward (future).
        pos_indices = torch.arange(self.max_backward, -self.max_forward - 1, -1, device=queries.device).unsqueeze(0)
        max_span_plus_1 = pos_indices.shape[1]

        sin_emb_timing_signal = self._get_timing_signal_1d_pos(pos_indices, dtype=queries.dtype)
        projected_sin_emb = self.pos_proj(sin_emb_timing_signal)
        sin_emb = projected_sin_emb.reshape(1, max_span_plus_1, self.num_heads, self.head_dim).squeeze(0)

        # term_ac: content-content attention logits, [B, N, U, W, C].
        queries_p = queries.permute(0, 3, 1, 2, 4)
        keys_p_t = keys.permute(0, 3, 1, 4, 2)
        term_ac = torch.matmul(queries_p, keys_p_t)

        # term_bd: content-position logits against the projected sinusoidal
        # embeddings, realigned with _relative_shift.
        q_permuted = queries.permute(0, 3, 1, 2, 4)
        s_permuted = sin_emb.permute(1, 2, 0)  # [N, H, F_span]
        q_reshaped = q_permuted.reshape(batch_size, num_heads, num_query_blocks * query_block_size, head_dim)
        term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted)
        term_bd_unshifed = term_bd_unshifed_matmul.reshape(
            batch_size,
            num_heads,
            num_query_blocks,
            query_block_size,
            max_span_plus_1,
        )

        term_bd_shifted = self._relative_shift(
            term_bd_unshifed,
            batch_size,
            num_heads,
            num_query_blocks,
            query_block_size,
            key_context_size,
            max_span_plus_1,
        )

        return term_ac + term_bd_shifted


class Gemma3nAudioAttention(nn.Module):
    def __init__(self, config: Gemma3nAudioConfig):
        super().__init__()
        self.config = config

        self.num_heads = self.config.conf_num_attention_heads
        self.hidden_size = self.config.hidden_size
        self.head_dim = self.hidden_size // self.num_heads

        self.chunk_size = self.config.conf_attention_chunk_size
        self.max_future_horizon = self.config.conf_attention_context_right
        self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1)
        self.attention_logits_soft_cap = self.config.conf_attention_logit_cap
        self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon

        self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(config)
        self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,)))

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)

        q_scale = self.head_dim**-0.5
        r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
        self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False)

        lower_causal_mask = torch.tril(
            torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
            diagonal=0,
        ).T
        upper_causal_mask = torch.tril(
            torch.ones((self.chunk_size, self.context_size), dtype=torch.bool),
            diagonal=self.max_past_horizon + self.max_future_horizon,
        )
        local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool)
        local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
        self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False)

        self.register_buffer(
            "softcap",
            torch.tensor(self.attention_logits_soft_cap).float(),
            persistent=False,
        )

    def _pad_dim1(self, x: torch.Tensor, pad_left: int, pad_right: int) -> torch.Tensor:
        batch, _, *tail_shape = x.shape
        left = x.new_zeros((batch, pad_left, *tail_shape))
        right = x.new_zeros((batch, pad_right, *tail_shape))
        x = torch.cat([left, x, right], dim=1)
        return x

    def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        """
        shape = hidden_states.shape
        b, t = shape[:2]
        num_blocks = (t + self.chunk_size - 1) // self.chunk_size

        if (padding_len := num_blocks * self.chunk_size - t) > 0:
            hidden_states = self._pad_dim1(hidden_states, 0, padding_len)

        permute_dims = (b, num_blocks, self.chunk_size) + shape[2:]
        hidden_states = hidden_states.reshape(permute_dims).contiguous()
        return hidden_states

    def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        """
        pad_left = self.max_past_horizon
        pad_right = self.max_future_horizon + self.chunk_size - 1
        hidden_states = self._pad_dim1(hidden_states, pad_left, pad_right)

        frame_len = self.context_size
        frame_step = self.chunk_size

        x_unfolded = hidden_states.unfold(dimension=1, size=frame_len, step=frame_step)

        # unfold moves the unfolded axis to the last dim for >2D inputs; move it
        # back next to the block axis so the layout is [B, U, C, ...].
        if hidden_states.ndim > 2 and x_unfolded.ndim > 3:
            x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2)

        return x_unfolded.contiguous()

    def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
        qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim)

        query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous()
        key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous()
        value_states = self.v_proj(hidden_states).reshape(qkv_shape).contiguous()

        per_dim_scale_sp = torch.nn.functional.softplus(self.per_dim_scale)

        broadcast_shape = (1, 1, 1, self.head_dim)
        per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape)
        query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast

        batch_size, q_time = query_states.shape[:2]

        query_blocks = self._convert_to_block(query_states)
        key_blocks = self._extract_block_context(key_states)
        value_blocks = self._extract_block_context(value_states)
        num_query_blocks = query_blocks.shape[1]

        # Create the validity mask for blocked keys: True for valid positions.
        original_valid_mask = ~mask
        extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask)

        # If unfolding introduced an extra feature dim, flatten back to [B, U, C].
        if (
            extracted_valid_mask_blocks.ndim == 4
            and extracted_valid_mask_blocks.shape[2] * extracted_valid_mask_blocks.shape[3] == self.context_size
        ):
            extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
                batch_size, num_query_blocks, self.context_size
            )

        if extracted_valid_mask_blocks.shape != (batch_size, num_query_blocks, self.context_size):
            raise ValueError(
                "Shape of extracted_valid_mask_blocks"
                f" {extracted_valid_mask_blocks.shape} is not ({batch_size},"
                f" {num_query_blocks}, {self.context_size}) after potential reshape."
            )

        condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(1).unsqueeze(-2)
        condition_from_causality = self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)

        final_condition_for_where = torch.logical_and(
            condition_from_input_validity,
            condition_from_causality.to(condition_from_input_validity.device),
        )

        logits = self.relative_position_embedding(query_blocks, key_blocks)

        # Apply attention logit softcap.
        softcap_val = self.softcap.to(logits.dtype)
        logits = logits / softcap_val
        logits = torch.tanh(logits)
        logits = logits * softcap_val

        # Apply the combined mask, then softmax in float32.
        logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min)
        probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype)

        # Batched matmul over the flattened [B*N*U] blocks.
        b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
        h_dim = value_blocks.shape[-1]
        prob_bun = probabilities.permute(0, 1, 2, 3, 4).reshape(-1, w_dim, c_dim)
        v_bun = value_blocks.permute(0, 3, 1, 2, 4).reshape(-1, c_dim, h_dim)
        result_bmm = torch.bmm(prob_bun, v_bun)
        context_vectors = result_bmm.reshape(b_dim, n_dim, u_dim, w_dim, h_dim).permute(0, 2, 3, 1, 4)
        context_vectors = context_vectors.reshape(
            (
                batch_size,
                num_query_blocks * self.chunk_size,
                self.num_heads,
                self.head_dim,
            )
        )
        context_vectors = context_vectors[:, :q_time]

        return context_vectors


class Gemma3nAudioCumulativeGroupNorm(nn.Module):
    """Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
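
    A minimal usage sketch (illustrative only): for a [B, T, C] input with no extra feature
    dimensions, pass an empty `feature_dims`; the statistics at step `t` then depend only on
    steps `<= t`:

        norm = Gemma3nAudioCumulativeGroupNorm(num_channels=8, feature_dims=())
        out = norm(torch.randn(2, 4, 8))  # same shape, cumulatively normalized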
    """

    def __init__(
        self,
        num_channels: int,
        feature_dims: Sequence[int],
        eps: float = 1e-3,
    ):
        super().__init__()
        self.num_channels = num_channels
        self.feature_dims = tuple(feature_dims)
        self.eps = eps

        self.weight = nn.Parameter(torch.ones(num_channels))

        # Axes for normalization: all dimensions except batch (0) and time (1).
        self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Applies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        """
        expected_input_suffix = self.feature_dims + (self.num_channels,)
        if hidden_states.shape[2:] != expected_input_suffix:
            raise ValueError(
                f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected"
                f" suffix (feature_dims + num_channels) {expected_input_suffix}"
            )

        input_dtype = hidden_states.dtype
        # Calculations are performed in float32 for numerical stability.
        calc_dtype = torch.float32
        x_calc = hidden_states.to(calc_dtype)
        # Prepare an all-ones mask; every element contributes to the statistics.
        mask_calc = torch.ones_like(x_calc, dtype=calc_dtype)

        # Cumulative sums of values and of valid-element counts over time (dim 1).
        sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True)
        cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
        elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True)
        cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
        safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0)

        # Cumulative mean and variance.
        cum_mean = cum_sum_values / safe_cum_count_elements
        squared_diff_from_mean = (x_calc - cum_mean).pow(2)
        sum_sq_diff_at_t = torch.sum(squared_diff_from_mean, dim=self.reduction_axes, keepdim=True)
        cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1)
        cum_variance = cum_sum_sq_diff / safe_cum_count_elements

        normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps)

        scale = self.weight.to(calc_dtype)
        scale_view_shape = [1] * (hidden_states.dim() - 1) + [self.num_channels]
        final_output = normalized_x * scale.view(scale_view_shape)

        return final_output.to(input_dtype)


class Gemma3nAudioSSCPConvBlock(nn.Module):
    """A single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    r   idxinput_freq_dimmanual_paddingc                 J   t         |           || _        || _        |dk(  rdn| j                  j                  |dz
     }| j                  j                  |   }| j                  j
                  |   \  }}| j                  j                  |   \  }	}
t        j                  ||||f|	|
fdd      | _	        || j                  d   z   | j                  d   z   }||z
  |
z  dz   }t        ||f| j                  j                        | _        t        j                         | _        y )Nr   rS   )r   r   F)in_channelsout_channelskernel_sizestridepaddingr   )r  r  r   )r   rY   r   r  r   r   r   r   Conv2dconvr  r   normReLU
activation)r^   r   r  r  r  r  r  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convr   s                rc   rY   z"Gemma3nAudioSSCPConvBlock.__init__  s%    	, !8a)K)KCRSG)T{{99#>![[>>sC(![[>>sC(II#% h'

	 %t':':1'==@S@STU@VV!H,9A=
3%$44
	 '')r{   audio_encodingsr   c                 6   t        j                  || j                  dd      }| j                  |      }|j	                  dddd      j                         }| j                  |      }|j	                  dddd      j                         }| j                  |      S )NconstantrW   )modevaluer   r   r   rS   )Fr  r  r  r  rT  r  r  )r^   r  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          rc   r   z!Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r{   ))r   r   r   r   )rq   rr   rs   rt   r}   rv   r   rY   r   r   r   r   r   s   @rc   r  r    sc     5A)$")$ )$ 	)$
 c3S01)$V7u|| 7 7r{   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )#Gemma3nAudioSubSampleConvProjectionr   c                 p   t         |           || _        |j                  }g }g }t	        d      D ]n  }|j
                  |   \  }}|j                  |   \  }}	d}
|dz
  }d}d}|||
|f}|j                  |       ||z   |z   }||z
  |	z  dz   }|j                  |       |}p t        d|j                  ||d         | _	        t        d|d   ||d         | _
        |j                  d   }|d   }||z  | _        t        j                  | j                  | j                  j                  d      | _        y )Nr   r   rS   )r  r  r   r  r   Fr   )r   rY   r   r   r]   r   r   appendr  conv_0conv_1r   input_proj_in_featuresr   r   r3   input_proj_linear)r^   r   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsra   r  r  r  r  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tupler  f_out_after_convfinal_c_outfinal_f_outr   s                      rc   rY   z,Gemma3nAudioSubSampleConvProjection.__init__   s   $*$:$:!#%  "q 	9A!'!=!=a!@Hh!'!=!=a!@Hh I#a<L JK 	$  %++,@A 4j@;NK +h 68CaG!(()9:(8%=	9@ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr{   r  r   c                     |j                  d      }| j                  |      }| j                  |      }|j                  \  }}}}|j	                  dddd      j                         }|j                  ||||z        }	| j                  |	      }
|
S )NrS   r   r   r   )r   r  r  r  r  rT  rk  r  )r^   r  audio_encodings_reshapedr   rU  c_outt_outf_out
x_permutedoutput_flattenedr   s              rc   r   z+Gemma3nAudioSubSampleConvProjection.forward9  s     $3#<#<Q#? KK01KKN!"5%YYq!Q*557
%??1eUU]C''(89r{   	rq   rr   rs   r}   rY   r   r   r   r   r   s   @rc   r  r    s.    7m1 7mru||  r{   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerAttentionr   c                    t         |           || _        | j                  j                  | _        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _
        t        |      | _        t        j                  | j                  | j                  j                  d      | _        t        | j                  j                        | _        y )Nr   Fr   r   )r   rY   r   r3   post_in_featuresr   r   r   r   r   pre_attn_normr/  attnr   r   post	post_normr^   r   r   s     rc   rY   z'Gemma3nAudioConformerAttention.__init__I  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r{   r  audio_mel_maskr   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  ||      }|j
                  \  }}}}	|j                  ||||	z        }
| j                  |
      }t        j                  || j                   | j                        }|| j                  |      z   S r   )	r   r  r   r  r  r  r  r  r  )r^   r  r  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outrU  rV  r   r9   r  s              rc   r   z&Gemma3nAudioConformerAttention.forwardS  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A>#R  %=$B$B!1i#;#C#CAq)V^J^#_ ))$<=++o8N8N7NPTPfPfg,t~~o/NNNr{   
rq   rr   rs   r}   rY   r   r   r  r   r   r   s   @rc   r  r  H  sA    A1 AOu|| OUEUEU OZ_ZfZf Or{   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerFeedForwardr   c                    t         |           || _        | j                  dt	        j
                  | j                  j                        d       t        | j                  j                        | _	        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  dz  | j                  j                  d      | _        t        | j                  j                        | _        t	        j
                  | j                  j                        | _        y )Nr   Fr   rm   r   )r   rY   r   r   r   r   r   r   r3   pre_layer_normr   r   ffw_layer_1ffw_layer_2post_layer_normr   post_layer_scaler  s     rc   rY   z)Gemma3nAudioConformerFeedForward.__init__e  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF %T[[-M-M Nr{   r  r   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  |      }t
        j                  j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }||| j                  z  z   S r   )r   r  r   r  r   r   r  silur  r  r  )r^   r  residuals      rc   r   z(Gemma3nAudioConformerFeedForward.forwardq  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..??T-B-BBCCr{   r  r   s   @rc   r  r  d  s0    
O1 
O	Du|| 	D 	Dr{   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerLightConv1dr   c           	         t         |           || _        t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  dz  d      | _	        t        j                  | j                  j                  | j                  j                  | j                  j                  dd| j                  j                  d      | _        | j                  dt        j                  | j                  j                         d	       t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  d      | _        | j                  j                  dz
  | _        y )
Nr   r   Fr   rS   r   )r  r  r  r  r  groupsr   r   r   )r   rY   r   r   r3   r=   r  r   r   linear_startConv1dr   depthwise_conv1dr   r   r   r   	conv_norm
linear_endcausal_paddingr  s     rc   rY   z)Gemma3nAudioConformerLightConv1d.__init__~  sD   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr{   r  r   c                 :   |}| j                  |      }| j                  |      }t        j                  j                  j                  |d      }|j                  ddd      }t        j                  || j                  df      }| j                  |      }|j                  ddd      }t        j                  || j                   | j                        }| j                  |      }t        j                  j                  |      }| j                  |      }||z   }|S )Nr   r   r   r   rS   )r  r  r   r   r  glur  r  r  r  r  r  r   r  r  r  )r^   r  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr   s         rc   r   z(Gemma3nAudioConformerLightConv1d.forward  s   #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0H4K^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: #;;r{   r  r   s   @rc   r  r  }  s-    D1 D*u||  r{   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerBlockr   c                    t         |           || _        t        | j                        | _        t        | j                        | _        t        | j                        | _        t        | j                        | _	        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _        y )Nr   Fr   )r   rY   r   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endr   r   r   r   r   r3   r  r  s     rc   rY   z#Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r{   r  r  r   c                 j   | j                  |      }| j                  ||      }| }||j                  d      j                  |j                        z  }| j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }|S )Nr   )r  r  r   r   r   r  r  r   r  r   r  )r^   r  r  validity_mask_for_lconvaudio_encodings_for_lconv_inputr   s         rc   r   z"Gemma3nAudioConformerBlock.forward  s    ..?...I#1/*9<S<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r{   r  r   s   @rc   r  r    s;    	<1 	<u|| UEUEU Z_ZfZf r{   r  c                        e Zd ZU dZeed<   dZdef fdZdej                  dej                  deej                  ej                  f   fdZ xZS )Gemma3nAudioEncoderzfAn audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.r   	audio_melc                     t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        y c c}w r   )r   rY   r   r  subsample_conv_projectionr   
ModuleListr]   r   r  	conformer)r^   r   r!  r   s      rc   rY   zGemma3nAudioEncoder.__init__  sV     )LV)T&9>v?\?\9]^A'/^
^s   A-r  r   c                 ,   | j                  |      }|j                  d   }d}t        t        | j                  j
                              D ]!  }|| j                  j
                  |   d   z  }# t        j                  ||j                        |z  }t        j                  ||j                  d   dz
        }|j                  dkD  r>|j                  dk(  r/|j                  d      j                  |j                  d   d      }n`|j                  |j                  k(  rG|j                  d   dk(  r5|j                  d   dk7  r#||j                  d   k(  r|j                  d      }t        j                  |d|      }| j                  D ]  }	 |	||      } | j                  j                  dkD  r@|dddd| j                  j                  f   }|dddd| j                  j                  f   }|j!                  |j                  d      d      }||fS )a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        rS   r   r  )r   r   NrW   )r$  r  r]   r[   r   r   r   r   r   r  rb  r   expandgatherr&  r   masked_fill)
r^   r"  r  r  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks
             rc   r   zGemma3nAudioEncoder.forward  s    88C  %%a($S)J)J%KL 	YO4;;#D#D_#UVW#XX	Y ,,u^-B-BCFYY++g>+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^ 	CE#O\BO	C ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV,,r{   )rq   rr   rs   rt   r}   r   main_input_namerY   r   r   r  r   r   r   r   s   @rc   r!  r!    sY    p!O
1 
5-5-7<7G7G5-	u||U---	.5-r{   r!  c                       e Zd Zy)Gemma3nTextScaledWordEmbeddingNrq   rr   rs   rX   r{   rc   r3  r3        r{   r3  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerr   c                    t         |           || _        t        j                  | j                  j
                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j
                  d      | _        t        | j                  j
                  | j                  j                        | _        y )NFr   r   )r   rY   r   r   r   r3   rO   linear_leftlinear_rightr   r=   post_laurel_normr  s     rc   rY   zGemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er{   rR  r   c                 r    | j                  |      }| j                  |      }| j                  |      }||z   S r   )r9  r:  r;  )r^   rR  laurel_hidden_statesnormed_laurel_hidden_statess       rc   r   zGemma3nTextLaurelBlock.forward  sC    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#:::r{   )
rq   rr   rs   rt   r/   rY   r   r   r   r   r   s   @rc   r7  r7    s0    *f0 f;U\\ ;ell ;r{   r7  c                        e Zd Zd	dedef fdZdej                  dej                  fdZdej                  dej                  fdZ	 xZ
S )
Gemma3nTextMLPr   	layer_idxc                 t    t         |   |       |j                  |   | _        |j                  |   | _        y r   )r   rY   r5   rP   activation_sparsityr^   r   rA  r   s      rc   rY   zGemma3nTextMLP.__init__%  s6     !'!9!9)!D#)#E#Ei#P r{   rR  r   c                     | j                  |      }| j                  dkD  r| j                  |      }| j                  |      }| j	                  |      }| j                  ||z        }|S )NrW   )	gate_projrC  _gaussian_topkact_fnup_proj	down_proj)r^   rR  rF  activationsrI  rJ  s         rc   r   zGemma3nTextMLP.forward*  sc    NN=1	##c)++I6Ikk),,,}-NN;#89	r{   inputsc                    t        j                  | j                  t         j                  |j                        }t         j
                  j                  j                  dd      }|j                  |      }|j                  |j                        }t        j                  |dd      }t        j                  |ddd      }|||z  z   }t        j                  j                  ||z
        S )	Nr   r   r   rS   r   Tr  F)r   r   unbiased)r   r   rC  r   r   distributionsnormalNormalicdfr  r   r   stdr   r  relu)r^   rL  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           rc   rG  zGemma3nTextMLP._gaussian_topk3  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&8"344r{   )r   )rq   rr   rs   r/   rv   rY   r   r   r   rG  r   r   s   @rc   r@  r@  $  sP    Q0 QS Q
U\\ ell 5U\\ 5ell 5r{   r@  c                   X    e Zd ZdZdef fdZdej                  dej                  fdZdej                  dej                  fdZ	d	ej                  d
ej                  dej                  fdZ
dej                  dej                  fdZdej                  dej                  fdZ xZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    r   c                 F   t         |           || _        t        j                  t        j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  | j                  j                  d      | _        t        | j                  j                  | j                  j                        | _        | j#                  dt        j$                  | j                  j                  dz        d       y )NFr   r   r   router_input_scaleg      r   )r   rY   r   r   r   r   r<  r3   correct_output_scaler   rM   correction_coefsprediction_coefsmodality_routerr   r=   router_normr   r   r  s     rc   rY   zGemma3nTextAltUp.__init__P  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr{   r   r   c                     | j                  |      | j                  z  }| j                  |      }t        j                  |j                               j                  |      S r   )rd  r_  rc  r   rm  rx   r   )r^   r   router_inputsrouteds       rc   compute_router_modalitiesz*Gemma3nTextAltUp.compute_router_modalitiesZ  sM    ((+d.E.EE%%m4zz&,,.)11!44r{   rR  c                    | j                  || j                  j                           }| j                  ro| j                  j                  Y| j
                  j                  j                  j                  | j                  j                   | j                  j                          | j                  |      j                  g |j                  dd | j                  j                  | j                  j                   j                  dddd      }t        j                  |j                  dddd      |      }|j                  dddd      }||z  }|j                         j!                  |      S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        Nr   r   rS   r   r   )rh  r   rJ   trainingrK   rb  r   dataclamp_r  r  rM   r  r   r   rT  r   )r^   rR  
modalities	all_coefspredictionss        rc   predictzGemma3nTextAltUp.predict_  s@    33M$++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSnoD!!*-Wi &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15}$%%'//>>r{   ro  	activatedc                 j   | j                  |      }||| j                  j                     z
  }|j                  | j                  j                  ddd      }| j                  j
                  Y| j                  j                  j                  j                  | j                  j
                   | j                  j
                         | j                  |      dz   }|j                  ddd      j                  d      }t        j                  ||      }||z  }|j                         j                  |      S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        rS   r   r   r   r   )rh  r   rJ   repeatrM   rK   ra  r   rk  rl  r  r   r   mulrT  r   )r^   ro  rq  rm  
innovationrn  	correcteds          rc   correctzGemma3nTextAltUp.correct{  s    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
;;&&2!!((--44dkk6Q6Q5QSWS^S^SnSno
 #'"7"7
"Cc"I	%%aA.88<	IIj)4	[ 	##%--i88r{   rv  c                 p    |j                  | j                        | j                  z  j                  |      S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )r   r`  r^   rv  s     rc   r   zGemma3nTextAltUp.forward  s2     !!$";";<t?X?XXaabkllr{   c                 $    | j                  |      S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)r   ry  s     rc   scale_corrected_outputz'Gemma3nTextAltUp.scale_corrected_output  s    ||I&&r{   )rq   rr   rs   rt   r/   rY   r   r   rh  rp  rw  r   r{  r   r   s   @rc   r]  r]  D  s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9:m m%,, m' ' 'r{   r]  c                       e Zd Zy)Gemma3nTextRotaryEmbeddingNr4  rX   r{   rc   r}  r}    r5  r{   r}  r   r  r  position_idsunsqueeze_dimc                 n    |j                  |      }|j                  |      }| |z  t        |       |z  z   S )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `torch.Tensor`: the input tensor `x` rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    return (x * cos) + (rotate_half(x) * sin)
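
# --- Illustrative sketch (editor addition, not part of the original model code) ---
# `apply_rotary_pos_emb` rotates pairs of feature dimensions by a
# position-dependent angle. With unsqueeze_dim=1, `cos`/`sin` of shape
# [batch, seq_len, head_dim] broadcast over the head axis of a
# [batch, num_heads, seq_len, head_dim] tensor:
def _rope_usage_sketch() -> torch.Tensor:
    batch, num_heads, seq_len, head_dim = 1, 2, 6, 8
    x = torch.randn(batch, num_heads, seq_len, head_dim)
    inv_freq = 1.0 / (10_000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.arange(seq_len).float()[:, None] * inv_freq[None, :]
    cos = torch.cat([angles.cos(), angles.cos()], dim=-1).expand(batch, -1, -1)
    sin = torch.cat([angles.sin(), angles.sin()], dim=-1).expand(batch, -1, -1)
    return apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1)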
eej                  eej                     eeej                        f   fdZ xZS )Gemma3nTextAttentionr   rA  c                    t         |           | `| `t	        |j
                  |j                  d      | _        | j                  j                  | j                  j                  z
  }||cxk\  xr dkD  nc | _        |j                  |   }| j                  r0|dz
  |j                  |dz
  d d   j                  |      z
  | _        y d | _        y )NF)r   r   r   r   rS   r   )r   rY   attn_logit_softcappingscalingr   r9   r=   v_normr   r6   rN   is_kv_shared_layerrH   indexkv_shared_layer_index)r^   r   rA  first_kv_shared_layer_idx
layer_typer   s        rc   rY   zGemma3nTextAttention.__init__  s    'L$f>Q>Q^cd$(KK$A$ADKKDdDd$d!"+/H"L1"L''	2
 && &)F,>,>?X[\?\?b`b?b,c,i,ijt,uu 	"  	"r{   rR  position_embeddingsattention_maskpast_key_valuecache_positionr_   r   c                 >   |j                   d d }g |d| j                  j                  }|\  }	}
| j                  |      j	                  |      }| j                  |      }t        ||	|
d      }|j                  dd      }| j                  r| j                  ||j                  | j                     }|j                  |j                  j                        }t        |t              r_|j                   d   |j!                         kD  rt#        d|j!                               }n$|j%                  d|j!                         dz
        }|j                  d d d d |f   j                  |j                        }|j&                  d d d d |f   j                  |j                        }n| j)                  |      j	                  |      }| j+                  |      }t        ||	|
d      }|j                  dd      }| j-                  |      j	                  |      }| j/                  |      }|j                  dd      }|2|
|	|| j0                  d}|j3                  ||| j4                  |      \  }}t6        }| j                  j8                  dk7  rt:        | j                  j8                     } || ||||f| j<                  r| j>                  nd	d
| j0                  d|\  }} |j@                  g |d jC                         }| jE                  |      }||fS )Nr   r   )r  rS   r   )rp  r   )r  r  r  rG   eagerrW   r   )dropoutr  rG   )#r  r   r9   r>  rk  q_normr  	transposer  r  layersr   r  r   rZ   r   get_max_cache_shapeslicer  valuesr?  k_normr@  r  rG   updaterA  r!   _attn_implementationr   rj  rF   r  rT  o_proj)r^   rR  r  r  r  r  r_   input_shapehidden_shaper  r  rt  layerr.  ru  rv  cache_kwargsattention_interfaceattn_outputattn_weightss                       rc   r   zGemma3nTextAttention.forward  s    $))#2.??b?$++*>*>?&S{{=166|D{{<0+L#sRST#--a3""t'A'A'MR`Rl"))$*D*DEE$''

(9(9:G%!34!''*U-F-F-HH#Au'@'@'BCG%mmu7P7P7RUV7VmWG Aq'M255l6I6IJJ <<1g699,:M:MNL]388FJZ0J-j#sRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L% "0"&"5"5	L (6'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**..
%
 
%
!\ *k));;;;FFHkk+.L((r{   NN)rq   rr   rs   r/   rv   rY   r   r   r   r
   
LongTensorr   r   r   r   r   r   s   @rc   r  r    s    
0 
S 
* +/59H)||H) #\\H) !.	H)
 !H) !!1!12H) -.H) 
u||Xell3XeELL>Q5RR	SH)r{   r  c                   p    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dej                  dej                  deej                     d	eej                     d
ee
   dee   dee   deej                     deej                  eeej                  ej                  f      f   fdZ xZS )Gemma3nTextDecoderLayerr   rA  c                    t         |   ||       t        ||      | _        |j                  | _        t
        |j                     | _        t        |      | _	        t        |      | _        t        ||      | _        t        j                  | j                   | j                  d      | _        t        j                  | j                  | j                   d      | _        t'        | j                   |j(                        | _        y )N)rA  Fr   r   )r   rY   r@  mlpr4   r	   r:   rH  r]  altupr7  laurelr  	self_attnr   r   r3   per_layer_input_gateper_layer_projectionr   r=   post_per_layer_input_normrD  s      rc   rY   z Gemma3nTextDecoderLayer.__init__$  s    +!&I>+1+M+M(V556%f-
,V4-fi@$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r{   rR  position_embeddings_globalposition_embeddings_localper_layer_inputr  r~  r  output_attentionsr>   r  r   c                    | j                   j                  |      }|| j                  j                     }| j	                  |      }| j                  |      }| j                  j                  r|}n|} | j                  d|||||||	|
d|\  }}| j                  |      }||z   }||z   t        j                  d      z  }| j                  |      }| j                  |      }| j                  |      }||z   }| j                   j                  ||      }|| j                  j                     j                         }| j                  j                   r| j                   j#                  |      }| j%                  |      }| j'                  |      }t)        j*                  ||      }| j-                  |      }| j/                  |      }|dd xxx |z  ccc |f}|r||fz  }|S )N)rR  r  r  r~  r  r  r>   r  r   rS   rX   )r  rp  r   rJ   input_layernormr  r  
is_slidingpost_attention_layernormr   r   pre_feedforward_layernormr  post_feedforward_layernormrw  rB  rL   r{  r  rH  r   multiplyr  r  )r^   rR  r  r  r  r  r~  r  r  r>   r  r_   ro  active_predictionactive_prediction_normedlaurel_outputr  r  self_attn_weights
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionoutputss                               rc   r   zGemma3nTextDecoderLayer.forward2  s    jj((7'(D(DE#'#7#78I#J $<= >>$$";"<"0$.. 
#
2 3)%)/)
#
 
#
 ,,T2&-
!M1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!(*)++Gr{   )NNNFFN)rq   rr   rs   r/   rv   rY   r   r   r   r  r
   ry   r   r   r   r   r   s   @rc   r  r  #  s   c0 cS c( 2637*.,1$)59C||C %*LLC $)<<	C
 C !.C u//0C !C $D>C D>C !!1!12C 
u||XeE,=,=u?P?P,P&QRR	SCr{   r  c                   (    e Zd ZU eed<   dZdgZd Zy)Gemma3nPreTrainedModelr    r  c                 n   t        j                  |       t        |t              r&|j                  j
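
# --- Illustrative sketch (editor addition, not part of the original model code) ---
# The decoder layer above wires AltUp around the usual attention/MLP path:
# predictions are made for all `altup_num_inputs` streams, only the active
# stream runs through attention + laurel + MLP, and `correct` propagates the
# result back to every stream. In pseudo-code:
#
#     predictions = altup.predict(hidden_states)          # [n, B, T, D]
#     active = attn_mlp_path(predictions[active_idx])     # [B, T, D]
#     hidden_states = altup.correct(predictions, active)  # [n, B, T, D]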
                  j                  d       y t        |t              r%|j                  j
                  j                          y t        |t              r%|j                  j
                  j                          y y )Nr   )r   _init_weightsrZ   r  r   rk  fill_r/  r=  zero_r]  r`  )r^   modules     rc   r  z$Gemma3nPreTrainedModel._init_weights}  s~    ++F3f=>MM$$S) 56  %%++- 01'',,224 2r{   N)rq   rr   rs   r   r   base_model_prefix_no_split_modulesr  rX   r{   rc   r  r  x  s    235r{   r  zBThe base Gemma 3n language model without a language modeling head.)custom_introc                       e Zd ZU eed<   def fdZdej                  dej                  fdZ		 ddej                  de
ej                     dej                  fdZee	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     d	e
ej                     d
e
ej                     de
e   de
ej                     de
e   de
e   de
e   de
ej                     dee   defd              Z xZS )Gemma3nTextModelr   c           
         t         |   |       |j                  | _        |j                  | _        t	        |j
                  |j                  |j                  z  | j                  |j                  dz        | _        t        j                  | j                  |j                  |j                  z  d      | _        t        |j                  |j                        | _        t        j                  t!        |j                        D cg c]  }t#        ||       c}      | _        t        |j                  |j                        | _        t        j                  t!        d| j(                  j*                        D cg c].  }t        j                  | j                  | j                  d      0 c}      | _        t        j                  t!        d| j(                  j*                        D cg c].  }t        j                  | j                  | j                  d      0 c}      | _        | j1                  dt3        j4                  | j                  dz        d	       | j1                  d
t3        j6                  t3        j4                  d            d	       t9        |      | _        t=        j>                  |      }|j@                  |_!        ddi|_"        t9        |      | _#        y c c}w c c}w c c}w )Nr   )embed_scaleFr   r   rS   per_layer_projection_scaler1  r   per_layer_input_scaleg       @)r   	rope_typedefault)$r   rY   r3   r4   r3  r2   r6   padding_idxembed_tokens_per_layerr   r   per_layer_model_projectionr   r=   per_layer_projection_normr%  r]   r  r  r  r   rM   altup_projectionsaltup_unembed_projectionsr   r   r   r  r}  
rotary_embcopydeepcopyrD   rB   rC   rotary_emb_local)r^   r   rA  r!  r   s       rc   rY   zGemma3nTextModel.__init__  sT    !--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&mmINvOgOgIhiI$VY7i
 #6#5#56;N;NO	!#PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw"
 *,PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg4FC
 v&"77*I6 :& I/ j x xs   	K.?3K3/3K8	input_idsr   c                      | j                  |      j                  g |j                  | j                  j                  | j
                   S r   )r  r  r  r   r6   r4   )r^   r  s     rc   get_per_layer_inputsz%Gemma3nTextModel.get_per_layer_inputs  sP    =t**95== 
__
KK))
 ,,
 	
r{   inputs_embedsper_layer_inputsc                    | j                  |      }|| j                  j                  |j                  |j                        z  } |j
                  g |j                  d d | j                  j                  | j                   }| j                  |      }||S |j                  |j                  k7  r |dd | j                  j                  d d f   }||z   | j                  j                  |j                  |j                        z  S )NrN  r   .)r  r  r   r   r   r  r  r   r6   r4   r  r  )r^   r  r  r  s       rc   project_per_layer_inputsz)Gemma3nTextModel.project_per_layer_inputs  s.   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  <3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$'774;U;U;X;X%%.B.I.I <Y <
 
 	
r{   r  r~  past_key_valuesr>   r  output_hidden_statesr  r_   c                 P	   ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|du |duz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|"| j                  |      }| j                  |      }| j                  ||      }|r|| j                  s
t               }|
F||j                         nd}t        j                  |||j                   d   z   |j"                        }
||
j%                  d      }t'        |x}t(              s*| j                   |||
||d}t+        di |t-        di |d	}|}| j/                  ||      }| j1                  ||      }t        j2                  |d
z  dd      dz  }t        j4                  d      }|g}t7        d| j                   j8                        D ]  } | j:                  |dz
     |      }|j=                  |j>                  |j"                        }t        j2                  |d
z  dd      }t        j@                  t        jB                  ||j=                  |j"                                    }||z  |z  }|jE                  |        t        jF                  |d      }|	rdnd}|rdnd}| jH                  d| j                   jJ                   D ]V  }|	r||fz  }||jL                     }|dddd|jN                  ddf   } |||||f||||||
d|}|d   }|sN||d   fz  }X |	r||fz  }t        j2                  |d   d
z  dd      dz  }|d   g}t7        d| j                   j8                        D ]  } | jP                  |dz
     ||         } | j=                  |j>                  |j"                        }t        j2                  |d
z  dd      }t        j@                  t        jB                  ||j=                  |j"                                    }||z  |z  }|jE                  |        t        jF                  |      }t        j2                  |d      }| jS                  |      }tU        ||||      S )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   rS   r  )r   input_embedsr  r  r  r~  )rU   rV   r   r   Tr  r   gh㈵>rN  r   rX   )r  r~  r  r  r>   r  )last_hidden_stater  rR  
attentions)+r   r  r  r>   r\   gradient_checkpointingrj  r   warning_onceembed_tokensr  r  r   get_seq_lengthr   r   r  r   r   rZ   rz   r   r   r  r  r   r   r]   rM   r  r   r   r   maximumr  stackr  r6   attention_typerA  r  r  r   )!r^   r  r  r  r~  r  r  r>   r  r  r  r_   past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0r  r  target_magnitudeepsilon_tensortemp_hidden_statesra   
altup_projcurrent_hidden_statenew_magnituderR  all_hidden_statesall_self_attnsdecoder_layercausal_maskr  layer_outputsaltup_unemb_projs!                                    rc   r   zGemma3nTextModel.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M#88C88HXY0*nO!CRC^==?de"\\  =#6#6q#99$++N )33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%F%U%U# ( &*___l%S"$($9$9/<$X! !::oq&8b$OSVVd+-.q$++667 	<A6//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $6A> #7BD0d![[)H4;;+H+HI 	6M#!m%55!-m.J.JKK.q!]5L5La/OPO)*)	
  +)."3#- M *!,M =#3"551	66  -!11 !::mA&6!&;TRVYY+A./q$++667 	<A-RT-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $67

=a8		-0&+++%	
 	
r{   r   )
NNNNNNNNNN)rq   rr   rs   r/   r   rY   r   r  r   r  r   r  r   r   r
   r   ry   r   r   r   r   r   r   s   @rc   r  r    s   ,J0 ,J\
e.>.> 
5<< 
 48
||
 #5<<0
 
	
6  15371537+/59$(,0/359T
E,,-T
 #5<<0T
 !.	T

 u//0T
 "%T
   1 12T
 D>T
 $D>T
 'tnT
 !!1!12T
 +,T
 
!T
  T
r{   r  z?The base Gemma 3n language model with a language modeling head.c                       e Zd ZddiZdZy)Gemma3nForCausalLMzmodel.language_modelmodelN)rq   rr   rs   _checkpoint_conversion_mappingr  rX   r{   rc   r  r  t  s    &<g%F"r{   r  c                        e Zd ZdZdeeef   def fdZ	 	 d	de	e
j                     de	e
j                     de
j                  fdZ xZS )
Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configr   c                 r   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                        | _        t        | j                  | j
                        | _        t        | j                  | j
                        | _        t        j                  | j                  | j                  d      | _        t        | j                  | j
                  d      | _        y )Nr   Fr   )r   r   )r   rY   r3   multimodal_hidden_sizer=   r   r   r1   text_hidden_sizer   	Embedding	embeddingr   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)r^   r  r   r   s      rc   rY   z"Gemma3nMultimodalEmbedder.__init__}  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r{   r  r  r   c                     |du |duz  rt        d      || j                  |      }n/| j                  || j                  z
        }| j	                  |      }| j                  |      }| j                  |      S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r\   r  r  r   r  r  r  )r^   r  r  emb_normhard_embemb_norm_projs         rc   r   z!Gemma3nMultimodalEmbedder.forward  s     -t";<YZZ$//>H~~i$2C2C&CDH//9H11(;22=AAr{   r  )rq   rr   rs   rt   r   r}   r   r/   rY   r   r   r  r   r   r   r   s   @rc   r  r  z  sq    [t !35H!HIt 't* 1504BE,,-B  -B 
	Br{   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                        e Zd Zi Zdef fdZdej                  dej                  fdZdej                  dej                  dej                  d	ej                  fd
Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deeeej                     ef      deej                     deej                     deej                     deej                     dee   dee   dee   defd       Zdej                  dej                  deej                  ej                  f   fdZd Z xZS )Gemma3nModelr   c                 8   t         |           | `|j                  j                  | _        t        j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y r   )r   rY   multi_modal_projectorr   r2   r   from_configr   audio_towerr  r   embed_visionembed_audior  s     rc   rY   zGemma3nModel.__init__  sw    &*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\]r{   pixel_valuesr   c                 t   | j                  |dd      j                  }|j                  |j                  d   | j                  j
                  j                  | j                  j                        j                  ddd      }|| j                  j
                  j                  dz  z  }| j                  |      S )	a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        FT)r'  r   return_dictr   r   rS   r   r  )
vision_towerr  r  r  r   r   r3   r   r  r%  )r^   r'  vision_outputss      rc   get_image_featureszGemma3nModel.get_image_features  s     **%%T + 


 	
 (//  #KK%%11KK44
 '!Q
	 	 	$++33??DD  ~ >>r{   r  r  image_featuresaudio_featuresc                    || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|R||   j                         |j                         k7  r.t        d| d|j                  d   |j                  d   z         |j                         }|j                  d      j                  |      j                  |j                        }|R||   j                         |j                         k7  r.t        d| d|j                  d   |j                  d   z         ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        rN  r   z6Image features and image tokens do not match: tokens: z, features r   rS   z6Audio features and image tokens do not match: tokens: )get_input_embeddingsr   r   r   r   longr   allr   r  r   	expand_asr   numelr\   r  )	r^   r  r  r.  r/  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            rc   get_placeholder_maskz!Gemma3nModel.get_placeholder_mask  sl    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;.4,,.LL!;!;5::VcVjVjk c"g  "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%-8J*K*Q*Q*SWeWkWkWm*mHHXXcdrdxdxyzd{  M  S  S  TU  V  eV  dW  X  ,//1/99"=GGVYYZgZnZno%-8J*K*Q*Q*SWeWkWkWm*mHHXXcdrdxdxyzd{  M  S  S  TU  V  eV  dW  X  "#555r{   input_featuresr  input_features_maskr~  r  token_type_idsr  labelsr>   r  r  c                  	   |du |
duz  rt        d      ||n| j                  j                  }||n| j                  j                  }|1 | j	                         |      }
t        j                  |dk\  || j                  k        }t        j                  ||t        j                  |            }| j                  j                  |      }t        j                  || j                  j                  k\  || j                  j                  k        }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j%                  d      j'                  |
      }t        j                  |||
      }
|| j                  j                  k\  }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j%                  d      j'                  |
      }t        j                  |||
      }
nd}|`| j)                  |      }|j!                  |
j"                  |
j*                        }| j-                  ||
|      \  }}|
j/                  ||      }
|4|1| j1                  ||       \  } }t        j2                  | j                  dz
  ggt
        j4                  | j"                        }!| j                  |!      }"t        j                  |j%                  d      |"|       } | j6                  \  }#}$}%| j                  j8                  |$z
  }&|"j;                  |#|&|%      }'t        j<                  | |'fd	      } | j!                  |
j"                  |
j*                        } | j-                  ||
| 
      \  }}(|
j/                  |(|       }
 | j                  dd|||||
|||d|	d|})t?        |)j@                  |r|)jB                  nd|)jD                  |)jF                  |nd|       S d      S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma-3n-E4B")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3n-E4B")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is not None:
            # Token ids below vocab_size_per_layer_input also index the per-layer embedding tables; everything
            # else (vision/audio soft tokens) is mapped to 0 before the per-layer lookup.
            per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input)
            per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids))
            per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens)

            inputs_embeds = self.get_input_embeddings()(input_ids)

            # Vision soft tokens live in [embed_vision.vocab_offset, embed_audio.vocab_offset); embed them with
            # the vision embedder and splice the result into the text embeddings.
            vision_mask = torch.logical_and(
                input_ids >= self.embed_vision.vocab_offset, input_ids < self.embed_audio.vocab_offset
            )
            dummy_vision_token_id = self.embed_vision.vocab_offset + self.embed_vision.vocab_size - 1
            vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device)
            vision_embeds = self.embed_vision(input_ids=vision_input_ids)
            expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds)
            inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds)

            # Audio soft tokens live at and above embed_audio.vocab_offset.
            audio_mask = input_ids >= self.embed_audio.vocab_offset
            dummy_audio_token_id = self.embed_audio.vocab_offset + self.embed_audio.vocab_size - 1
            audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device)
            audio_embeds = self.embed_audio(input_ids=audio_input_ids)
            expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
            inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds)
        else:
            per_layer_inputs = None

        # Merge text and images
        if pixel_values is not None and input_ids is not None:
            image_features = self.get_image_features(pixel_values)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        # Merge text and audio
        if input_features is not None and input_ids is not None:
            audio_features, audio_mask = self.get_audio_features(input_features, ~input_features_mask)

            # Replace masked (padded) audio frames with the embedding of the last audio token, then right-pad
            # every clip to the fixed number of audio soft tokens the prompt reserves per clip.
            audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device)
            audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
            audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features)

            audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
            extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len
            extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim)

            audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
            audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_audio_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            special_audio_mask = special_audio_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)

        outputs = self.language_model(
            input_ids=None,
            per_layer_inputs=per_layer_inputs,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **lm_kwargs,
        )

        return Gemma3nModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values if use_cache else None,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
            audio_hidden_states=audio_features if input_features is not None else None,
        )
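
    # A minimal, self-contained sketch of the `masked_scatter` merge used above (toy sizes; illustrative only,
    # not part of the model). `masked_scatter` fills the True positions of the mask with values drawn from the
    # source tensor in order:
    #
    #     inputs_embeds = torch.zeros(1, 3, 4)                           # (batch, seq, hidden)
    #     mask = torch.tensor([True, True, False]).view(1, 3, 1).expand(1, 3, 4)
    #     image_features = torch.ones(1, 2, 4)                           # one embedding per masked position
    #     merged = inputs_embeds.masked_scatter(mask, image_features)    # rows 0-1 become ones, row 2 stays zero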
    def get_audio_features(
        self, input_features: torch.Tensor, input_features_mask: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            input_features (`torch.FloatTensor` of shape `(num_images, seq_length, num_features)`):
                The tensors corresponding to the input audio.
            input_features_mask (`torch.FloatTensor` of shape `(num_images, seq_length)`):
                The attention mask for the input audio.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`.
        """
        audio_outputs, audio_mask = self.audio_tower(input_features, input_features_mask)
        return self.embed_audio(inputs_embeds=audio_outputs), audio_mask

    def _update_causal_mask(self, **super_kwargs):
        raise AttributeError("We don't want to inherit it")
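
    # Usage sketch for `get_audio_features`, assuming `model` is a loaded `Gemma3nModel` and `inputs` comes from
    # its processor (the variable names are illustrative, not additional API):
    #
    #     audio_embeds, audio_mask = model.get_audio_features(
    #         inputs["input_features"], ~inputs["input_features_mask"]
    #     )
    #     # audio_embeds: (num_clips, audio_length, hidden_size), ready to be spliced into the text
    #     # sequence exactly as `forward` does above.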


@auto_docstring(
    custom_intro="""
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    """
)
class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration):
    _checkpoint_conversion_mapping = {}
    base_model_prefix = "model"

    @property
    def audio_tower(self):
        return self.model.audio_tower

    @property
    def multi_modal_projector(self):
        raise AttributeError("Use embed_vision instead of multi_modal_projector.")

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_features_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Gemma3nCausalLMOutputWithPast:
        r"""
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            input_features=input_features,
            attention_mask=attention_mask,
            input_features_mask=input_features_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            token_type_ids=token_type_ids,
            cache_position=cache_position,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            **lm_kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute the necessary logits; do not upcast them to float unless we are computing the loss.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None:
            # Soft-cap the logits into (-softcap, softcap): softcap * tanh(logits / softcap).
            logits = logits / final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * final_logit_softcapping

        loss = None
        if labels is not None:
            # Upcast to float to avoid potential precision issues when computing the loss.
            logits = logits.float()
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            if attention_mask is not None:
                # Use the (2D) input attention mask to select valid positions, cropping it in case it is
                # longer than the shifted logits.
                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
            else:
                shift_logits = shift_logits.contiguous()
                shift_labels = shift_labels.contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
            flat_labels = shift_labels.view(-1).to(shift_logits.device)
            loss = loss_fct(flat_logits, flat_labels)

        return Gemma3nCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
            audio_hidden_states=outputs.audio_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        pixel_values=None,
        input_features=None,
        input_features_mask=None,
        attention_mask=None,
        token_type_ids=None,
        use_cache=True,
        logits_to_keep=None,
        labels=None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # In the cached decoding stage the multimodal inputs must be None, because the input ids no longer
        # contain any special placeholder tokens; on the first forward pass they are handed to the model.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
            model_inputs["input_features"] = input_features
            model_inputs["input_features_mask"] = input_features_mask

        return model_inputs

    def _prepare_4d_causal_attention_mask_with_cache_position(self, **super_kwargs):
        raise AttributeError("Do not inherit _prepare_4d_causal_attention_mask_with_cache_position from PaliGemma")
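
# The final-logit softcapping applied in `Gemma3nForConditionalGeneration.forward` bounds logits smoothly within
# (-softcap, softcap) via `softcap * tanh(logits / softcap)`. A quick numerical check (illustrative only):
#
#     import torch
#     softcap = 30.0
#     logits = torch.tensor([1.0, 100.0, 1e6])
#     softcap * torch.tanh(logits / softcap)   # ≈ tensor([ 0.9996, 29.9238, 30.0000])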

__all__ = [
    "Gemma3nAudioConfig",
    "Gemma3nAudioEncoder",
    "Gemma3nConfig",
    "Gemma3nForCausalLM",
    "Gemma3nForConditionalGeneration",
    "Gemma3nModel",
    "Gemma3nPreTrainedModel",
    "Gemma3nTextConfig",
    "Gemma3nTextModel",
    "Gemma3nVisionConfig",
]