
import math
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, SlidingWindowLayer
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from .configuration_gemma3n import Gemma3nAudioConfig, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Gemma3n outputs, with hidden states and attentions.
    """
)
class Gemma3nModelOutputWithPast(ModelOutput):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)
__name__
__module____qualname____doc__r(   r   torchFloatTensor__annotations__r)        /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr'   r'   2   s5     8<%"3"34;7;%"3"34;r2   r'   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    """
)
class Gemma3nCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
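# Illustrative sketch (not part of the upstream file): how the `past_key_values`
# returned in these outputs are typically fed back for incremental decoding.
# `model` is assumed to be a Hugging Face causal LM that returns a cache object.
def _example_incremental_decoding(model, input_ids):
    outputs = model(input_ids=input_ids, use_cache=True)
    past = outputs.past_key_values  # cached keys/values for all processed tokens
    next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    # On the next step only the new token is passed; the cached states avoid
    # recomputing attention over the full prefix.
    outputs = model(input_ids=next_token, past_key_values=past, use_cache=True)
    return outputs.logits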
    Nlosslogitspast_key_valueshidden_states
attentionsr(   r)   )r*   r+   r,   r-   r6   r   r.   r/   r0   r7   r8   r   listr
   r9   tupler:   r(   r)   r1   r2   r3   r5   r5   M   s    & )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;7;%"3"34;r2   r5   c                   r     e Zd Zd
dededef fdZd Zdej                  dej                  fdZ
d	 Z xZS )Gemma3nRMSNormdimeps
with_scalec                     t         |           || _        || _        | j                  r.t	        j
                  t        j                  |            | _        y | j                  dt        j                  d      d       y )Nweight      ?F
persistent)super__init__r@   rA   nn	Parameterr.   onesrC   register_buffertensor)selfr?   r@   rA   	__class__s       r3   rH   zGemma3nRMSNorm.__init__r   sY    $??,,uzz#7DK  5<<+< Or2   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr   T)keepdim)r.   sqrtpowmeanr@   )rN   xs     r3   _normzGemma3nRMSNorm._norm|   s4    5::aeeAhmmBm=HIIIr2   rV   returnc                     | j                  |j                               | j                  j                         z  }|j                  |      S N)rW   floatrC   type_as)rN   rV   outputs      r3   forwardzGemma3nRMSNorm.forward   s9     AGGI&):):)<<~~a  r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r<   rC   shaper@   rN   s    r3   
extra_reprzGemma3nRMSNorm.extra_repr   s'    ))*+6$((<<r2   )gư>T)r*   r+   r,   intr[   boolrH   rW   r.   Tensorr^   rb   __classcell__rO   s   @r3   r>   r>   q   sG    PC Pe P PJ! !%,, !=r2   r>   c                       e Zd Zdef fdZdej                  dej                  dej                  fdZdej                  de	d	e	d
e	de	de	de	dej                  fdZ
dej                  dej                  dej                  fdZ xZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                 R   t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j                  | j                  z  | _        t        d| j                  j                  dz
        | _
        | j                  j                  | _        t        j                  | j                  | j                  | j                  z  d      | _        d}d}| j                  dz  }t!        j"                  t%        |      t%        |      z        t        |dz
  d      z  }|t'        j(                  t'        j*                  |      | z        z  }| j-                  d|j%                         j/                  d      j/                  d      d	       y )
Nr   r    FbiasrD   g     @r   inv_timescalesrE   )rG   rH   rj   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrI   Linearpos_projmathlogr[   r.   exparangerL   	unsqueeze)rN   rj   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrn   rO   s          r3   rH   z.Gemma3nAudioRelativePositionEmbedding.__init__   sL   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r2   positiondtyperX   c                 P   |j                         j                  d      }|| j                  j                  |j                  t
        j                        z  }t        j                  t        j                  |      t        j                  |      gd      }|j                  |      S )NrQ   )devicer   r?   )r[   r   rn   tor   r.   float32catsincostype)rN   r   r   scaled_timetiming_signals        r3   _get_timing_signal_1d_posz?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s}    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r2   term_bd_before_shift
batch_sizerp   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     |dz   |z
  }d|f}	t         j                  j                  ||	      }
|
j                  |||||dz   z  f      }|ddddddd||z  f   }|j                  |||||f      }|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
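# Illustrative sketch (not part of the upstream file): the pad-reshape-slice
# "skewing" trick used by `_relative_shift` to realign relative-position scores.
# W = query_block_size, C = key_context_size, F = C - W + 1.
def _example_relative_shift(term_bd, key_context_size):
    import torch.nn.functional as F

    b, n, u, w, f = term_bd.shape                               # [B, N, U, W, F]
    padded = F.pad(term_bd, (0, key_context_size + 1 - f))      # last dim -> C + 1
    flat = padded.reshape(b, n, u, w * (key_context_size + 1))  # merge the last two dims
    sliced = flat[..., : w * key_context_size]                  # drop the skew padding
    return sliced.reshape(b, n, u, w, key_context_size)         # [B, N, U, W, C]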
        r    r   N)rI   
functionalpadreshape)rN   r   r   rp   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r3   _relative_shiftz5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  $4q$89	
 *!Q3X5EHX5X3X*XY )00   
 r2   querieskeysc           	      R   |j                   \  }}}}}|j                   \  }}}	}}t        j                  | j                  | j                   dz
  d|j
                        j                  d      }
|
j                   d   }| j                  |
|j                        }| j                  |      }|j                  d|| j                  | j                        j                  d      }|j                  ddddd      }|j                  ddddd      }t        j                  ||      }|j                  ddddd      }|j                  ddd      }|j                  ||||z  |      }t        j                  ||      }|j                  |||||      }| j!                  ||||||	|      }||z   S )	Nr    rQ   r   r   r   r   r      )r`   r.   r~   rv   rx   r   r   r   r   rz   r   rp   rs   squeezepermutematmulr   )rN   r   r   r   r   r   rp   rs   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r3   r^   z-Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
$&6	8'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >w}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
I?ORb?bdlm

 #(,,z:"F 3::
 ..
 ((r2   )r*   r+   r,   r!   rH   r.   re   r   r   rc   r   r^   rf   rg   s   @r3   ri   ri      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L)r2   ri   c                   $    e Zd Zdef fdZdej                  dededej                  fdZdej                  dej                  fd	Z	dej                  dej                  fd
Z
dej                  dej                  dej                  fdZ xZS )Gemma3nAudioAttentionrj   c                    t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j
                  | j                  z  | _        | j                  j                  | _        | j                  j                  | _
        t        d| j                  j                  dz
        | _        | j                  j                  | _        | j                  | j                  z   | j                  z   | _        t#        |      | _        t'        j(                  t+        j,                  | j                  f            | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        | j                  dz  }dt*        j&                  j8                  j;                  t+        j<                  d            z  }| j?                  d||z  jA                         jC                         d	       t+        jD                  t+        jF                  | j                   | j                  ft*        jH                  
      d      jJ                  }t+        jD                  t+        jF                  | j                  | j                   ft*        jH                  
      | j                  | j                  z         }t+        jF                  | j                  | j                   ft*        jH                  
      }||z  |z  }| j?                  d|d	       | j?                  dt+        j<                  | j                        jM                         d	       y )Nr   r    Frl         rD           q_scalerE   r   )diagonallocal_causal_valid_masksoftcap)'rG   rH   rj   ro   rp   rq   rs   conf_attention_chunk_size
chunk_sizerw   max_future_horizonrt   ru   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizeri   relative_position_embeddingrI   rJ   r.   zerosper_dim_scalery   q_projk_projv_projr   softplusrM   rL   clonedetachtrilrK   rd   Tr[   )rN   rj   r   r_softplus_0lower_causal_maskupper_causal_maskr   rO   s          r3   rH   zGemma3nAudioAttention.__init__7  s   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY<)?(F(F(H(O(O(Q^cd!JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9<M"MPa"a68O\abLL778>>@ 	 	
r2   rV   pad_left	pad_rightrX   c                     |j                   ^}}}|j                  ||g|      }|j                  ||g|      }t        j                  |||gd      }|S )Nr    r   )r`   	new_zerosr.   r   )	rN   rV   r   r   batchr   
tail_shapeleftrights	            r3   	_pad_dim1zGemma3nAudioAttention._pad_dim1b  s^     !q:{{E89j9:UI;
;<IItQ&A.r2   r9   c                 (   |j                   }|dd \  }}|| j                  z   dz
  | j                  z  }|| j                  z  |z
  x}dkD  r| j                  |d|      }||| j                  f|dd z   }|j                  |      j	                         }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr   r    r   )r`   r   r   r   
contiguous)rN   r9   r`   bt
num_blockspadding_lenpermute_dimss           r3   _convert_to_blockz'Gemma3nAudioAttention._convert_to_blocki  s     ##Ray1$//)A-$//A
%7!;;Kq@ NN=![IM:t7%)C%--l;FFHr2   c                 \   | j                   }| j                  | j                  z   dz
  }| j                  |||      }| j                  }| j                  }|j                  d||      }|j                  dkD  r'|j                  dkD  rt        j                  |dd      }|j                         S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r    )	dimensionsizestepr   r   rQ   )sourcedestination)
r   r   r   r   r   unfoldndimr.   movedimr   )rN   r9   r   r   	frame_len
frame_step
x_unfoldeds          r3   _extract_block_contextz,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}h	J%%	__
 #))AIJ)W
 !joo&9 z"!LJ$$&&r2   maskc                 	   g |j                   d d | j                  | j                  }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }t        j                  j                  j                  | j                        }ddd| j                  f}|j                  |      }	|| j                  z  |	z  }|j                   d d \  }
}| j                  |      }| j!                  |      }| j!                  |      }|j                   d   }| }| j!                  |      }|j"                  dk(  rI|j                   d   |j                   d   z  | j$                  k(  r|j	                  |
|| j$                        }|j                   |
|| j$                  fk7  r,t'        d|j                    d|
 d| d| j$                   d		      |j)                  d      j)                  d
      }| j*                  j)                  d      j)                  d      j)                  d      }t        j,                  ||j/                  |j0                              }| j3                  ||      }| j4                  j/                  |j0                        }||z  }t        j6                  |      }||z  }t        j8                  ||t        j:                  |j<                        j>                        }t        j                  j                  jA                  |dt        jB                        j/                  |j<                        }|j                   \  }}}}}|j                   d   }|jE                  ddddd      j	                  d||      }|jE                  ddddd      j	                  d||      }t        jF                  ||      } | j	                  |||||      jE                  ddddd      }!|!j	                  |
|| jH                  z  | j                  | j                  f      }!|!d d d |f   }!|!S )NrQ   r    r   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   r?   r   r   )%r`   rp   rs   r   r   r   r   r   r.   rI   r   r   r   viewr   r   r   r   r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rN   r9   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer7   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r3   r^   zGemma3nAudioAttention.forward  sT   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#dll25OO)//3
F--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*M,d.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,
K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 4??2	
 *!WfW*5r2   )r*   r+   r,   r!   rH   r.   re   rc   r   r   r   
BoolTensorr^   rf   rg   s   @r3   r   r   6  s    )
1 )
V5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell dr2   r   c                   r     e Zd ZdZ	 d	dedee   def fdZdej                  dej                  fdZ
 xZS )
Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
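# Illustrative sketch (not part of the upstream file): computing streaming
# (cumulative) normalization statistics over time with cumulative sums, the
# idea behind this layer. `x` is [B, T, C]; mask handling is omitted here.
def _example_cumulative_norm(x, eps=1e-3):
    import torch

    csum = x.sum(dim=-1, keepdim=True).cumsum(dim=1)  # running sum over features and time
    count = x.shape[-1] * torch.arange(
        1, x.shape[1] + 1, device=x.device, dtype=x.dtype
    ).view(1, -1, 1)                                   # elements seen up to each step
    mean = csum / count
    csq = (x - mean).pow(2).sum(dim=-1, keepdim=True).cumsum(dim=1)
    var = csq / count
    return (x - mean) * torch.rsqrt(var + eps)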
    num_channelsfeature_dimsr@   c           	         t         |           || _        t        |      | _        || _        t        j                  t        j                  |            | _
        t        t        ddt        | j                        z   dz               | _        y )Nr   r    )rG   rH   r!  r<   r"  r@   rI   rJ   r.   rK   rC   rangelenreduction_axes)rN   r!  r"  r@   rO   s       r3   rH   z(Gemma3nAudioCumulativeGroupNorm.__init__'  sr     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr2   r9   rX   c                    | j                   | j                  fz   }|j                  dd |k7  rt        d|j                  dd  d|       |j                  }t
        j                  }|j                  |      }t        j                  ||      }t        j                  || j                  d      }t        j                  |d	      }t        j                  || j                  d      }	t        j                  |	d	      }
t        j                  |
d
      }||z  }||z
  j                  d      }t        j                  || j                  d      }t        j                  |d	      }||z  }||z
  t        j                  || j                  z         z  }| j                   j                  |      }dg|j#                         dz
  z  | j                  gz   }||j%                  |      z  }||z  }|j                  |      S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   Tr?   rR   r    r   rD   )r   )r"  r!  r`   r   r   r.   r   r   	ones_likesumr&  cumsumclamprT   rsqrtr@   rC   r?   r   )rN   r9   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r3   r^   z'Gemma3nAudioCumulativeGroupNorm.forward9  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF*=	  ))F0C0CTRo1= "'9$:M:MW[!\"\\*@aH"'++.@c"J "$;;
 #)8"3!8!8!; 99%;ATAT^bc  ,,'7Q? ')@@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r2   )gMbP?)r*   r+   r,   r-   rc   r   r[   rH   r.   re   r^   rf   rg   s   @r3   r   r     sT    ( 	NN smN 	N$G,U\\ G,ell G,r2   r   c                   ~     e Zd ZdZ	 d
dedededeeeeef   f fdZdej                  dej                  fd	Z
 xZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    rj   idxinput_freq_dimmanual_paddingc                 J   t         |           || _        || _        |dk(  rdn| j                  j                  |dz
     }| j                  j                  |   }| j                  j
                  |   \  }}| j                  j                  |   \  }	}
t        j                  ||||f|	|
fdd      | _	        || j                  d   z   | j                  d   z   }||z
  |
z  dz   }t        ||f| j                  j                        | _        t        j                         | _        y )Nr   r    )r   r   F)in_channelsout_channelskernel_sizestridepaddingrm   )r!  r"  r@   )rG   rH   rj   rE  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerI   Conv2dconvr   sscp_conv_group_norm_epsnormReLU
activation)rN   rj   rC  rD  rE  rG  rH  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrO   s                r3   rH   z"Gemma3nAudioSSCPConvBlock.__init__  s%    	, !8a)K)KCRSG)T{{99#>![[>>sC(![[>>sC(II#% h'

	 %t':':1'==@S@STU@VV!H,9A=
3%$44
	 '')r2   audio_encodingsrX   c                 6   t        j                  || j                  dd      }| j                  |      }|j	                  dddd      j                         }| j                  |      }|j	                  dddd      j                         }| j                  |      S )Nconstantr   )modevaluer   r   r   r    )Fr   rE  rP  r   r   rR  rT  )rN   r[  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r3   r^   z!Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r2   ))r   r   r   r   )r*   r+   r,   r-   r!   rc   r<   rH   r.   re   r^   rf   rg   s   @r3   rB  rB    sc     5A)$")$ )$ 	)$
 c3S01)$V7u|| 7 7r2   rB  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )#Gemma3nAudioSubSampleConvProjectionrj   c                 p   t         |           || _        |j                  }g }g }t	        d      D ]n  }|j
                  |   \  }}|j                  |   \  }}	d}
|dz
  }d}d}|||
|f}|j                  |       ||z   |z   }||z
  |	z  dz   }|j                  |       |}p t        d|j                  ||d         | _	        t        d|d   ||d         | _
        |j                  d   }|d   }||z  | _        t        j                  | j                  | j                  j                  d      | _        y )Nr   r   r    )rC  rD  rj   rE  rQ   Frl   )rG   rH   rj   input_feat_sizer$  rM  rN  appendrB  conv_0conv_1rL  input_proj_in_featuresrI   ry   rq   input_proj_linear)rN   rj   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsirU  rV  rW  rX  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerY  f_out_after_convfinal_c_outfinal_f_outrO   s                      r3   rH   z,Gemma3nAudioSubSampleConvProjection.__init__  s   $*$:$:!#%  "q 	9A!'!=!=a!@Hh!'!=!=a!@Hh I#a<L JK 	$  %++,@A 4j@;NK +h 68CaG!(()9:(8%=	9@ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr2   r[  rX   c                     |j                  d      }| j                  |      }| j                  |      }|j                  \  }}}}|j	                  dddd      j                         }|j                  ||||z        }	| j                  |	      }
|
S )Nr    r   r   r   )r   rk  rl  r`   r   r   r   rn  )rN   r[  audio_encodings_reshapedrV   r   c_outt_outf_out
x_permutedoutput_flattenedr]   s              r3   r^   z+Gemma3nAudioSubSampleConvProjection.forward   s     $3#<#<Q#? KK01KKN!"5%YYq!Q*557
%??1eUU]C''(89r2   	r*   r+   r,   r!   rH   r.   re   r^   rf   rg   s   @r3   rg  rg    s.    7m1 7mru||  r2   rg  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerAttentionrj   c                    t         |           || _        | j                  j                  | _        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _
        t        |      | _        t        j                  | j                  | j                  j                  d      | _        t        | j                  j                        | _        y )Ngradient_clippingFrE   rl   )rG   rH   rj   rq   post_in_featuresrL   r.   rM   r  r>   pre_attn_normr   attnrI   ry   post	post_normrN   rj   rO   s     r3   rH   z'Gemma3nAudioConformerAttention.__init__  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r2   r[  audio_mel_maskrX   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  ||      }|j
                  \  }}}}	|j                  ||||	z        }
| j                  |
      }t        j                  || j                   | j                        }|| j                  |      z   S rZ   )	r.   r,  r  r  r  r`   r   r  r  )rN   r[  r  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rp   rs   r|  s              r3   r^   z&Gemma3nAudioConformerAttention.forward  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A>#R  %=$B$B!1i#;#C#CAq)V^J^#_ ))$<=++o8N8N7NPTPfPfg,t~~o/NNNr2   
r*   r+   r,   r!   rH   r.   re   r  r^   rf   rg   s   @r3   r  r    sA    A1 AOu|| OUEUEU OZ_ZfZf Or2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerFeedForwardrj   c                    t         |           || _        | j                  dt	        j
                  | j                  j                        d       t        | j                  j                        | _	        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  dz  | j                  j                  d      | _        t        | j                  j                        | _        t	        j
                  | j                  j                        | _        y )Nr  FrE   r   rl   )rG   rH   rj   rL   r.   rM   r  r>   rq   pre_layer_normrI   ry   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r3   rH   z)Gemma3nAudioConformerFeedForward.__init__,  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF %T[[-M-M Nr2   r[  rX   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  |      }t
        j                  j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }||| j                  z  z   S rZ   )r.   r,  r  r  r  rI   r   silur  r  r  )rN   r[  residuals      r3   r^   z(Gemma3nAudioConformerFeedForward.forward8  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..??T-B-BBCCr2   r  rg   s   @r3   r  r  +  s0    
O1 
O	Du|| 	D 	Dr2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerLightConv1drj   c           	         t         |           || _        t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  dz  d      | _	        t        j                  | j                  j                  | j                  j                  | j                  j                  dd| j                  j                  d      | _        | j                  dt        j                  | j                  j                         d	       t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  d      | _        | j                  j                  dz
  | _        y )
Nr@   r   Frl   r    r   )rG  rH  rI  rJ  rK  groupsrm   r  rE   )rG   rH   rj   r>   rq   rms_norm_epsr  rI   ry   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drL   r.   rM   r  	conv_norm
linear_endcausal_paddingr  s     r3   rH   z)Gemma3nAudioConformerLightConv1d.__init__E  sD   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr2   r[  rX   c                 :   |}| j                  |      }| j                  |      }t        j                  j                  j                  |d      }|j                  ddd      }t        j                  || j                  df      }| j                  |      }|j                  ddd      }t        j                  || j                   | j                        }| j                  |      }t        j                  j                  |      }| j                  |      }||z   }|S )NrQ   r   r   r   r    )r  r  r.   rI   r   glur   r`  r   r  r  r,  r  r  r  r  )rN   r[  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr]   s         r3   r^   z(Gemma3nAudioConformerLightConv1d.forwardZ  s   #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0H4K^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: #;;r2   r  rg   s   @r3   r  r  D  s-    D1 D*u||  r2   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerBlockrj   c                    t         |           || _        t        | j                        | _        t        | j                        | _        t        | j                        | _        t        | j                        | _	        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _        y )Nr  FrE   )rG   rH   rj   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endrL   r.   rM   r  r>   rq   rR  r  s     r3   rH   z#Gemma3nAudioConformerBlock.__init__p  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r2   r[  r  rX   c                 j   | j                  |      }| j                  ||      }| }||j                  d      j                  |j                        z  }| j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }|S )NrQ   )r  r  r   r   r   r  r  r.   r,  r  rR  )rN   r[  r  validity_mask_for_lconvaudio_encodings_for_lconv_inputr]   s         r3   r^   z"Gemma3nAudioConformerBlock.forward{  s    ..?...I#1/*9<S<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r2   r  rg   s   @r3   r  r  o  s;    	<1 	<u|| UEUEU Z_ZfZf r2   r  c                        e Zd ZU dZeed<   dZdef fdZdej                  dej                  deej                  ej                  f   fdZ xZS )Gemma3nAudioEncoderzfAn audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.rj   	audio_melc                     t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        y c c}w rZ   )rG   rH   rj   rg  subsample_conv_projectionrI   
ModuleListr$  conf_num_hidden_layersr  	conformer)rN   rj   r   rO   s      r3   rH   zGemma3nAudioEncoder.__init__  sV     )LV)T&9>v?\?\9]^A'/^
^s   A-r  rX   c                 ,   | j                  |      }|j                  d   }d}t        t        | j                  j
                              D ]!  }|| j                  j
                  |   d   z  }# t        j                  ||j                        |z  }t        j                  ||j                  d   dz
        }|j                  dkD  r>|j                  dk(  r/|j                  d      j                  |j                  d   d      }n`|j                  |j                  k(  rG|j                  d   dk(  r5|j                  d   dk7  r#||j                  d   k(  r|j                  d      }t        j                  |d|      }| j                  D ]  }	 |	||      } | j                  j                  dkD  r@|dddd| j                  j                  f   }|dddd| j                  j                  f   }|j!                  |j                  d      d      }||fS )a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
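# Illustrative sketch (not part of the upstream file): calling the audio encoder
# with toy shapes, assuming single-channel mel features laid out as
# [batch, num_frames, mel_bins] and a boolean mask where False marks valid frames.
def _example_audio_encoder_shapes(encoder, batch=2, num_frames=160, mel_bins=128):
    import torch

    audio_mel = torch.randn(batch, num_frames, mel_bins)
    audio_mel_mask = torch.zeros(batch, num_frames, dtype=torch.bool)  # False = valid
    encodings, output_mask = encoder(audio_mel, audio_mel_mask)
    return encodings.shape, output_mask.shape  # time is subsampled by the conv stack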
        r    r   r   )rt   rQ   Nr   )r  r`   r$  r%  rj   rN  r.   r~   r   r,  r   r   expandgatherr  conf_reduction_factormasked_fill)
rN   r  r  r[  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks
             r3   r^   zGemma3nAudioEncoder.forward  s    88C  %%a($S)J)J%KL 	YO4;;#D#D_#UVW#XX	Y ,,u^-B-BCFYY++g>+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^ 	CE#O\BO	C ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV,,r2   )r*   r+   r,   r-   r!   r0   main_input_namerH   r.   re   r  r<   r^   rf   rg   s   @r3   r  r    sY    p!O
1 
5-5-7<7G7G5-	u||U---	.5-r2   r  c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
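# Illustrative sketch (not part of the upstream file): the effect of the
# embed_scale factor, which for the text embeddings is typically sqrt(hidden_size).
def _example_scaled_embedding(vocab_size=256, hidden_size=64, padding_idx=0):
    import math
    import torch

    emb = Gemma3nTextScaledWordEmbedding(
        vocab_size, hidden_size, padding_idx, embed_scale=math.sqrt(hidden_size)
    )
    tokens = torch.tensor([[3, 7, 11]])
    scaled = emb(tokens)  # roughly emb.weight[tokens] * sqrt(hidden_size)
    return scaled.shape   # (1, 3, hidden_size)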
    num_embeddingsembedding_dimpadding_idxembed_scalec                 v    t         |   |||       | j                  dt        j                  |      d       y )Nr  FrE   )rG   rH   rL   r.   rM   )rN   r  r  r  r  rO   s        r3   rH   z'Gemma3nTextScaledWordEmbedding.__init__  s3    D]ELL,ERWXr2   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S rZ   )rG   r^   r  r   rC   r   )rN   r  rO   s     r3   r^   z&Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr2   )rD   )r*   r+   r,   r-   rc   r[   rH   r.   re   r^   rf   rg   s   @r3   r  r    sG    Ys Y3 YS Y_d YS S Sr2   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrj   c                    t         |           || _        t        j                  | j                  j
                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j
                  d      | _        t        | j                  j
                  | j                  j                        | _        y )NFrl   r  )rG   rH   rj   rI   ry   rq   laurel_ranklinear_leftlinear_rightr>   r  post_laurel_normr  s     r3   rH   zGemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er2   r9   rX   c                 r    | j                  |      }| j                  |      }| j                  |      }||z   S rZ   )r  r  r  )rN   r9   laurel_hidden_statesnormed_laurel_hidden_statess       r3   r^   zGemma3nTextLaurelBlock.forward  sC    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#:::r2   )
r*   r+   r,   r-   r#   rH   r.   re   r^   rf   rg   s   @r3   r  r    s0    *f0 f;U\\ ;ell ;r2   r  c                        e Zd Zd	dedef fdZdej                  dej                  fdZdej                  dej                  fdZ	 xZ
S )
Gemma3nTextMLPrj   	layer_idxc                    t         |           || _        |j                  | _        |j                  |   | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        |j                  |   | _        y NFrl   )rG   rH   rj   rq   intermediate_sizerI   ry   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrN   rj   r  rO   s      r3   rH   zGemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r2   r9   rX   c                     | j                  |      }| j                  dkD  r| j                  |      }| j                  |      }| j	                  |      }| j                  ||z        }|S )Nr   )r  r  _gaussian_topkr  r  r  )rN   r9   r  activationsr  r  s         r3   r^   zGemma3nTextMLP.forward  sc    NN=1	##c)++I6Ikk),,,}-NN;#89	r2   inputsc                    t        j                  | j                  t         j                  |j                        }t         j
                  j                  j                  dd      }|j                  |      }|j                  |j                        }t        j                  |dd      }t        j                  |ddd      }|||z  z   }t        j                  j                  ||z
        S )	Nr   r   r   r    rQ   Tr(  F)r?   rR   unbiased)r.   rM   r  r   r   distributionsnormalNormalicdfr   r   rU   stdrI   r   relu)rN   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r3   r  zGemma3nTextMLP._gaussian_topk  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&8"344r2   )r   )r*   r+   r,   r#   rc   rH   r.   re   r^   r  rf   rg   s   @r3   r  r    sP    	Q0 	QS 	QU\\ ell 5U\\ 5ell 5r2   r  c                   X    e Zd ZdZdef fdZdej                  dej                  fdZdej                  dej                  fdZ	d	ej                  d
ej                  dej                  fdZ
dej                  dej                  fdZdej                  dej                  fdZ xZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
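# Illustrative sketch (not part of the upstream file): how the predict/correct
# pair wraps a transformer sub-block. `altup` is a Gemma3nTextAltUp instance and
# `hidden_states` stacks the altup_num_inputs copies of the activations.
def _example_altup_step(altup, hidden_states, layer):
    predictions = altup.predict(hidden_states)            # [num_inputs, B, T, H]
    active = predictions[altup.config.altup_active_idx]   # only one copy runs the layer
    activated = layer(active)                              # the transformer sub-block
    corrected = altup.correct(predictions, activated)      # propagate to the other copies
    return corrected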
    rj   c                 F   t         |           || _        t        j                  t        j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  | j                  j                  d      | _        t        | j                  j                  | j                  j                        | _        | j#                  dt        j$                  | j                  j                  dz        d       y )NFrl   r   r  router_input_scaleg      rE   )rG   rH   rj   rI   rJ   r.   r   rq   correct_output_scalery   altup_num_inputscorrection_coefsprediction_coefsmodality_routerr>   r  router_normrL   rM   r  s     r3   rH   zGemma3nTextAltUp.__init__#  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr2   rV   rX   c                     | j                  |      | j                  z  }| j                  |      }t        j                  |j                               j                  |      S rZ   )r  r  r  r.   r   r[   r\   )rN   rV   router_inputsrouteds       r3   compute_router_modalitiesz*Gemma3nTextAltUp.compute_router_modalities-  sM    ((+d.E.EE%%m4zz&,,.)11!44r2   r9   c                    | j                  || j                  j                           }| j                  ro| j                  j                  Y| j
                  j                  j                  j                  | j                  j                   | j                  j                          | j                  |      j                  g |j                  dd | j                  j                  | j                  j                   j                  dddd      }t        j                  |j                  dddd      |      }|j                  dddd      }||z  }|j                         j!                  |      S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrQ   r   r    r   r   )r  rj   altup_active_idxtrainingaltup_coef_clipr
  rC   dataclamp_r   r`   r  r   r.   r   r   r\   )rN   r9   
modalities	all_coefspredictionss        r3   predictzGemma3nTextAltUp.predict2  s@    33M$++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSnoD!!*-Wi &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15}$%%'//>>r2   r  	activatedc                 j   | j                  |      }||| j                  j                     z
  }|j                  | j                  j                  ddd      }| j                  j
                  Y| j                  j                  j                  j                  | j                  j
                   | j                  j
    def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor:
        """Corrects the predictions relative to the activated inputs.

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        """
        modalities = self.compute_router_modalities(activated)

        if self.config.altup_coef_clip is not None:
            self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)

        # all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...).
        # Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup
        # input, then expand on the last dim for broadcastability.
        all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0
        all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1)

        innovation = activated - predictions[self.config.altup_active_idx]
        innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1)

        corrected = torch.mul(innovation, all_coefs)
        corrected += predictions
        return corrected.contiguous().type_as(activated)

    def forward(self, corrected: torch.Tensor) -> torch.Tensor:
        """
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        """
        return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected)

    def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
        """Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size]."""
        return self.forward(corrected)


class Gemma3nTextRotaryEmbedding(nn.Module):
    def __init__(self, config: Gemma3nTextConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
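
# Illustrative usage sketch of the rotary helpers above (comment-only; the tensor sizes are hypothetical and not
# taken from any Gemma 3n checkpoint):
#
#   >>> x = torch.randn(1, 16, 8, 64)                    # [batch, seq_len, heads, head_dim]
#   >>> rotate_half(x).shape                             # shape is preserved; the two halves are swapped/negated
#   torch.Size([1, 16, 8, 64])
#   >>> # rope = Gemma3nTextRotaryEmbedding(config)
#   >>> # cos, sin = rope(x, position_ids)               # each of shape [batch, seq_len, head_dim]
#   >>> # x_rot = apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=2)   # helper defined below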

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


def apply_rotary_pos_emb(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    unsqueeze_dim: int = 1,
) -> torch.Tensor:
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `torch.Tensor`: the input tensor rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    return (x * cos) + (rotate_half(x) * sin)


class Gemma3nTextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
        super().__init__()
        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.sliding_window = config.sliding_window if self.is_sliding else None

        self.q_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
        self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False)

        first_kv_shared_layer_idx = self.config.num_hidden_layers - self.config.num_kv_shared_layers
        self.is_kv_shared_layer = layer_idx >= first_kv_shared_layer_idx > 0
        # Find the index of the last layer of the same attention type that still computes its own KV states.
        layer_type = config.layer_types[layer_idx]
        self.kv_shared_layer_index = (
            first_kv_shared_layer_idx - 1 - config.layer_types[first_kv_shared_layer_idx - 1 :: -1].index(layer_type)
            if self.is_kv_shared_layer
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.config.head_dim)

        cos, sin = position_embeddings

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        query_states = self.q_norm(query_states)
        query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
        query_states = query_states.transpose(1, 2)

        if self.is_kv_shared_layer and self.kv_shared_layer_index is not None and past_key_value is not None:
            # Reuse the KV states computed by an earlier layer instead of projecting new ones.
            layer = past_key_value.layers[self.kv_shared_layer_index]
            # Device of the shared layer may differ from the current one.
            indices = cache_position.to(layer.keys.device)
            if isinstance(layer, SlidingWindowLayer):
                if cache_position.shape[0] > layer.get_max_cache_shape():
                    indices = slice(0, layer.get_max_cache_shape())
                else:
                    indices = indices.clamp(min=0, max=layer.get_max_cache_shape() - 1)

            key_states = layer.keys[:, :, indices].to(query_states.device)
            value_states = layer.values[:, :, indices].to(query_states.device)
        else:
            key_states = self.k_proj(hidden_states).view(hidden_shape)
            key_states = self.k_norm(key_states)
            key_states = apply_rotary_pos_emb(key_states, cos, sin, unsqueeze_dim=2)
            key_states = key_states.transpose(1, 2)

            value_states = self.v_proj(hidden_states).view(hidden_shape)
            value_states = self.v_norm(value_states)
            value_states = value_states.transpose(1, 2)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache.
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
                "sliding_window": self.sliding_window,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=1.0,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma3nTextDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Gemma3nTextAttention(config, layer_idx)
        self.mlp = Gemma3nTextMLP(config, layer_idx=layer_idx)
        self.input_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
        self.act_fn = ACT2FN[config.hidden_activation]

        self.altup = Gemma3nTextAltUp(config)
        self.laurel = Gemma3nTextLaurelBlock(config)
        self.per_layer_input_gate = nn.Linear(self.hidden_size, self.hidden_size_per_layer_input, bias=False)
        self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False)
        self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings_global: torch.Tensor,
        position_embeddings_local: torch.Tensor,
        per_layer_input: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        predictions = self.altup.predict(hidden_states)
        active_prediction = predictions[self.config.altup_active_idx]

        active_prediction_normed = self.input_layernorm(active_prediction)
        laurel_output = self.laurel(active_prediction_normed)

        # Sliding-window layers use the local RoPE embeddings, full-attention layers the global ones.
        if self.self_attn.is_sliding:
            position_embeddings = position_embeddings_local
        else:
            position_embeddings = position_embeddings_global

        attn, self_attn_weights = self.self_attn(
            hidden_states=active_prediction_normed,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        attn = self.post_attention_layernorm(attn)

        attn_gated = active_prediction + attn
        attn_laurel = (attn_gated + laurel_output) / math.sqrt(2)

        attn_norm = self.pre_feedforward_layernorm(attn_laurel)
        attn_ffw = self.mlp(attn_norm)
        attn_ffw_norm = self.post_feedforward_layernorm(attn_ffw)
        attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm
        corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated)

        first_prediction = corrected_predictions[self.config.altup_active_idx].clone()
        if self.config.altup_correct_scale:
            first_prediction = self.altup.scale_corrected_output(first_prediction)

        # per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...)
        first_prediction = self.per_layer_input_gate(first_prediction)
        first_prediction = self.act_fn(first_prediction)
        first_prediction = torch.multiply(first_prediction, per_layer_input)

        # per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...)
        first_prediction = self.per_layer_projection(first_prediction)
        first_prediction = self.post_per_layer_input_norm(first_prediction)
        corrected_predictions[1:] += first_prediction

        outputs = (corrected_predictions,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class Gemma3nPreTrainedModel(PreTrainedModel):
    config: Gemma3nConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _no_split_modules = ["Gemma3nTextDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Gemma3nTextDecoderLayer,
        "attentions": Gemma3nTextAttention,
    }

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, Gemma3nAudioCumulativeGroupNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Gemma3nAudioAttention):
            module.per_dim_scale.data.zero_()
        elif isinstance(module, Gemma3nTextAltUp):
            module.correct_output_scale.data.zero_()


@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.")
class Gemma3nTextModel(Gemma3nPreTrainedModel):
    config: Gemma3nTextConfig

    def __init__(self, config: Gemma3nTextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = Gemma3nTextScaledWordEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
        )
        self.layers = nn.ModuleList(
            [Gemma3nTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # The local RoPE layer reuses the rotary embedding class with the local base frequency and default scaling.
        config = copy.deepcopy(config)
        config.rope_theta = config.rope_local_base_freq
        config.rope_scaling = {"rope_type": "default"}
        self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config)

        self.hidden_size = config.hidden_size
        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input

        self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding(
            config.vocab_size_per_layer_input,
            config.num_hidden_layers * config.hidden_size_per_layer_input,
            self.padding_idx,
            embed_scale=config.hidden_size_per_layer_input**0.5,
        )
        self.per_layer_model_projection = nn.Linear(
            self.hidden_size,
            config.num_hidden_layers * config.hidden_size_per_layer_input,
            bias=False,
        )
        self.per_layer_projection_norm = Gemma3nRMSNorm(config.hidden_size_per_layer_input, eps=config.rms_norm_eps)

        self.altup_projections = nn.ModuleList(
            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
        )
        self.altup_unembed_projections = nn.ModuleList(
            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
        )

        self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False)
        self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        per_layer_inputs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        r"""
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
            per_layer_inputs = self.get_per_layer_inputs(input_ids)

        per_layer_inputs = self.project_per_layer_inputs(inputs_embeds, per_layer_inputs)

        if use_cache and past_key_values is None and not self.training:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states_0 = inputs_embeds

        # Initialize RoPE embeddings
        position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids)
        position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids)

        # Expand hidden_states to support the AltUp inputs, rescaling each projection to the input magnitude.
        target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5
        epsilon_tensor = torch.tensor(1e-5)

        temp_hidden_states = [hidden_states_0]
        for i in range(1, self.config.altup_num_inputs):
            # altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...)
            altup_proj = self.altup_projections[i - 1](hidden_states_0)
            current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
            temp_hidden_states.append(current_hidden_state)

        hidden_states = torch.stack(temp_hidden_states, dim=0)  # [num_altup_inputs, batch, seq_len, hidden_size]

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            causal_mask = causal_mask_mapping[decoder_layer.attention_type]
            per_layer_input = per_layer_inputs[:, :, decoder_layer.layer_idx, :]

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings_global,
                position_embeddings_local,
                per_layer_input,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        # Collapse the AltUp inputs back to a single output, again matching the magnitude of the active input.
        target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
        temp_hidden_states = [hidden_states[0]]
        for i in range(1, self.config.altup_num_inputs):
            # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
            altup_unemb_proj = self.altup_unembed_projections[i - 1](hidden_states[i])
            current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
            temp_hidden_states.append(current_hidden_state)

        hidden_states = torch.stack(temp_hidden_states)
        hidden_states = torch.mean(hidden_states, dim=0)
        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.embed_tokens_per_layer(input_ids).reshape(
            *input_ids.shape,
            self.config.num_hidden_layers,
            self.hidden_size_per_layer_input,
        )

    def project_per_layer_inputs(
        self,
        inputs_embeds: torch.Tensor,
        per_layer_inputs: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        per_layer_projection: torch.Tensor = self.per_layer_model_projection(inputs_embeds)
        per_layer_projection *= self.per_layer_projection_scale.to(
            dtype=inputs_embeds.dtype, device=per_layer_projection.device
        )
        per_layer_projection = per_layer_projection.reshape(
            *inputs_embeds.shape[:-1],
            self.config.num_hidden_layers,
            self.hidden_size_per_layer_input,
        )
        per_layer_projection = self.per_layer_projection_norm(per_layer_projection)

        if per_layer_inputs is None:
            return per_layer_projection

        if per_layer_projection.shape != per_layer_inputs.shape:
            # Per-layer inputs may be padded; keep only the embeddings for the layers this model actually has.
            per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :]

        return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale.to(
            dtype=inputs_embeds.dtype, device=per_layer_projection.device
        )


@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    config: Gemma3nTextConfig
    base_model_prefix = "model"
    _checkpoint_conversion_mapping = {"model.language_model": "model"}

    def __init__(self, config: Gemma3nTextConfig):
        super().__init__(config)
        self.model = Gemma3nTextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
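        >>> # (Illustrative addition, not part of the original example) the model warns during training unless the
        >>> # eager attention implementation is selected, e.g.:
        >>> # model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b", attn_implementation="eager")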

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma3n models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with "
                "`AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Gemma3nMultimodalEmbedder(nn.Module):
    """Embeds token ids or soft tokens for multimodal content into language model space."""

    def __init__(
        self,
        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
        text_config: Gemma3nTextConfig,
    ):
        super().__init__()

        self.multimodal_hidden_size = multimodal_config.hidden_size
        self.eps = multimodal_config.rms_norm_eps
        self.vocab_offset = multimodal_config.vocab_offset
        self.vocab_size = multimodal_config.vocab_size
        self.text_hidden_size = text_config.hidden_size

        self.embedding = nn.Embedding(self.vocab_size, self.multimodal_hidden_size)
        self.hard_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)
        self.soft_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)
        self.embedding_projection = nn.Linear(self.multimodal_hidden_size, self.text_hidden_size, bias=False)
        self.embedding_post_projection_norm = Gemma3nRMSNorm(self.text_hidden_size, eps=self.eps, with_scale=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is not None:
            emb_norm = self.soft_embedding_norm(inputs_embeds)
        else:
            hard_emb = self.embedding(input_ids - self.vocab_offset)
            emb_norm = self.hard_embedding_norm(hard_emb)

        emb_norm_proj = self.embedding_projection(emb_norm)
        return self.embedding_post_projection_norm(emb_norm_proj)


@auto_docstring(
    custom_intro="""
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    """
)
class Gemma3nModel(Gemma3nPreTrainedModel):
    _checkpoint_conversion_mapping = {}
    accepts_loss_kwargs = False

    def __init__(self, config: Gemma3nConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.vocab_size = config.text_config.vocab_size

        language_model = AutoModel.from_config(config.text_config)
        self.language_model = language_model

        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config)
        self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_outputs = self.vision_tower(
            pixel_values=pixel_values, do_pooling=False, return_dict=True
        ).last_hidden_state
        # Reshape the spatial feature map into a fixed number of soft tokens per image.
        vision_outputs = vision_outputs.reshape(
            vision_outputs.shape[0],
            self.config.vision_config.hidden_size,
            self.config.vision_soft_tokens_per_image,
        ).permute(0, 2, 1)
        # Normalize and embed the soft tokens into language model space.
        vision_outputs *= self.config.vision_config.hidden_size**0.5
        return self.embed_vision(inputs_embeds=vision_outputs)

    def get_placeholder_mask(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: torch.FloatTensor,
        image_features: Optional[torch.FloatTensor] = None,
        audio_features: Optional[torch.FloatTensor] = None,
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
            special_audio_mask = (
                inputs_embeds
                == self.get_input_embeddings()(
                    torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
            ).all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
            special_audio_mask = input_ids == self.config.audio_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features "
                f"{image_features.shape[0] * image_features.shape[1]}"
            )

        n_audio_tokens = special_audio_mask.sum()
        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        if audio_features is not None and inputs_embeds[special_audio_mask].numel() != audio_features.numel():
            raise ValueError(
                f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features "
                f"{audio_features.shape[0] * audio_features.shape[1]}"
            )

        return special_image_mask, special_audio_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_features_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **lm_kwargs,
    ) -> Gemma3nModelOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r    )r  rQ   )r  r;  r  r   )r  r<  T)r  r  rQ  r?  r8   r  r  r  r  r6  rx  )r  r8   r9   r:   r(   r)   r1   )$r   rj   r  r  r,  r.   r   r  r   
zeros_liker&  r  r)  r  r*  r  r   r   r   rB  r:  r   rH  masked_scatterget_audio_featuresrM   r?  r`   audio_soft_tokens_per_imager  r   r'   r  r8   r9   r:   )*rN   r  r3  rI  rQ  rJ  r?  r8   rK  rx  r  r  r  r  r  	lm_kwargsper_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr;  rD  r   r<  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresrE  r  s*                                             r3   r^   zGemma3nModel.forward  s   ^ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	  7D557	BM %*$5$5i1niRVRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++T..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM#.#8#8#<#F#F}#U !KK(<m][M #d&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL","6"6r":"D"D]"S!KK(;\=YM# #!44\BN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I)-)@)@ReQe)f&NJ "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%$%% 
-)%+'/!5)
 
 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r2   c                 T    | j                  ||      \  }}| j                  |      |fS )a-  
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
               The tensors corresponding to the input audio.
            input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
               The attention mask for the input audio.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`).
        r7  )r(  r*  )rN   rI  rJ  audio_outputsrY  s        r3   rO  zGemma3nModel.get_audio_featuresb  s5     %)$4$4^EX$Y!zm<jHHr2   )NNNNNNNNNNNNNN)r*   r+   r,   r  accepts_loss_kwargsr"   rH   r,  r.  r  r  r.   re   r:  r  r/   rH  r   r   r   r;   r
   rd   r5   r^   r<   rO  rf   rg   s   @r3   r!  r!  m  sA    &("} :8&#?u|| ? ?2(6##(6 (((6 ))	(6
 ))(6T  15486:156:37KO595959-1$(,0/3I
E,,-I
 u001I
 !!2!23	I

 !.I
 &ell3I
 u//0I
 "%U->->(?(F"GHI
 !!1!12I
 !!1!12I
   1 12I
 ))*I
 D>I
 $D>I
 'tnI
" 
'#I
 I
VI#llIAFI	u||U\\)	*Ir2   r!  z


@auto_docstring(
    custom_intro="""
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    """
)
class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = ["lm_head.weight"]
    base_model_prefix = "model"

    def __init__(self, config: Gemma3nConfig):
        super().__init__(config)
        self.model = Gemma3nModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(self, pixel_values):
        return self.model.get_image_features(pixel_values)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        raise AttributeError("Use embed_vision instead of multi_modal_projector.")

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_features_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Gemma3nCausalLMOutputWithPast:
        r"""
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
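        >>> # (Illustrative addition, not part of the original example) the processor returns the multimodal tensors
        >>> # the model consumes, e.g. `inputs["input_ids"]` and `inputs["pixel_values"]`; move them to the model
        >>> # device with `inputs = inputs.to(model.device)` before generating.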
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r3  rI  rQ  rJ  r?  r8   rK  rx  r  r  r  r  r  r6  .rQ   r    r   )r6   r7   r8   r9   r:   r(   r)   r1   )rj   r  r  r  r  r.  rc   r}  r  get_text_configr  r.   r   r[   r`   r   r   r   rI   CrossEntropyLossr   r  r  r5   r8   r9   r:   r(   r)   )rN   r  r3  rI  rQ  rJ  r?  r8   rK  rx  r  r  r  r  r  r  rQ  r  r9   r	  r7   r  r6   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r3   r^   z'Gemma3nForConditionalGeneration.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)) 3%+))'/!5
  !
&  118B>SV8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D,#33!//)) ' ; ; ' ; ;
 	
r2   c                 h    t        |   |f||||||||
d|}|d   dk(  r||d<   ||d<   |	|d<   |S )N)r8   r  rQ  r?  rx  r  r  rK  r   r3  rI  rJ  )rG   prepare_inputs_for_generation)rN   r  r8   r  rx  r?  r3  rI  rQ  rJ  rK  r  r  r  rW  model_inputsrO   s                   r3   r  z=Gemma3nForConditionalGeneration.prepare_inputs_for_generation'	  ss    $ w<
+')%)))
 
  !!+7L(-;L)*2EL./r2   c                 .    | j                   j                  S rZ   )r  r(  ra   s    r3   r(  z+Gemma3nForConditionalGeneration.audio_towerP	  s    zz%%%r2   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNN)"r*   r+   r,   r  r
  r  r"   rH   r,  r.  r  r  r:  propertyr&  r%  rt  r   r   r   r.   r  r/   re   r   r;   r
   rd   rc   r5   r^   r  r(  rf   rg   s   @r3   ri  ri  u  sW    &("*+} 1/((; ) ) ' ' S S  15486:156:37KO595959-1$(,0/334!A
E,,-A
 u001A
 !!2!23	A

 !.A
 &ell3A
 u//0A
 "%U->->(?(F"GHA
 !!1!12A
 !!1!12A
   1 12A
 ))*A
 D>A
 $D>A
 'tnA
  c5<</0!A
$ 
'%A
  A
L  'R & &r2   ri  )r  r  ri  r!  r  r  )r   NN)Nr    )_r  r{   collections.abcr   r   dataclassesr   typingr   r   r.   torch.nnrI   torch.nn.functionalr   r`  r  r	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor   configuration_gemma3nr!   r"   r#   r$   
get_loggerr*   r  r'   r5   Moduler>   ri   r   r   rB  rg  r  r  r  r  r  r  r  r  r  r  r'  rH  re   rc   rM  r[   r<   r[  r^  r`  r  r  r  r  r  r!  ri  __all__r1   r2   r3   <module>r     s  ,   . ! "     ! B B ) R B 9 O K F & _ _  l l 
		H	% 
<!8 < <* 
<K < <<=RYY =6g)BII g)T]BII ]@j,bii j,Z@7		 @7FF")) FRORYY O8Dryy D2(ryy (V 6E-/ E-P
SR\\ 
S;RYY ;$#5RYY #5L^'ryy ^'B< <D(	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %N ,0.||.	. 
. 5<<(	.
 .<q)299 q)hZ8 Zz 5_ 5 56 abt
- t
 ct
n ^_d
/ d
 `d
N/B		 /Bd I) IID W&&<o W&W&tr2   