
    rh                    z   d dl mZmZmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z*  e(       rd dl+m,Z, d dl-m.Z.m/Z/ ndZ, e'       r	d dl0m1Z1m2Z2 nd\  Z2Z1 e%jf                  e4      Z5 G d ded      Z6 G d de      Z7 G d dejp                        Z9d Z:d ejv                  d!e<d"ejv                  fd#Z=	 dHd$ejp                  d%ejv                  d&ejv                  d'ejv                  d(eejv                     d)e>d*e>d+e e"   fd,Z?dId-Z@ G d. d/ejp                        ZA G d0 d1ej                  jp                        ZBd2ejv                  d3e<fd4ZCd5 ZDd6 ZE eFe,e1e2f      ZGd7 ZH G d8 d9ejp                        ZI G d: d;ejp                        ZJ ed<       G d= d>ejp                               ZK G d? d@e      ZLe# G dA dBe             ZMe# G dC dDeM             ZNe# G dE dFeMe             ZOg dGZPy)J    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )CacheDynamicCacheDynamicLayer)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNNc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.pyr(   r(   ?   s7    " ######__r8   r(   F)totalc                       e Zd ZdZdZdZdZej                  dfde	f fdZ
	 ddej                  dej                  ded	eeeef      d
eej                  ej                  f   f
dZdej&                  fdZddee   d
efdZd
eeej                     eej                     f   fdZeddeeeej0                           d
dfd       Z xZS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    NFconfigc                 V   t         	|   t               |j                  | _        d| _        |j
                  }|j                  }g | _        g | _        g | _	        t        |j                        D ]*  }| j                  |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                   z  |z  z   |||      gz  c_        | xj                  t        j                  ||j"                  |j$                  |||      gz  c_        | xj                  t        j&                  g g|z  |      gz  c_        | xj                  t        j&                  g g|z  |      gz  c_        | j                  j)                  |       - t        |j                        D cg c]  }t        j&                  g g|z  |       c}| _        t        |j                        D cg c]  }t        j&                  g g|z  |       c}| _        y c c}w c c}w )N)layer_classesFmamba   devicedtyperC   )super__init__r   layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr2   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)
selfr=   
batch_sizerD   rC   conv_kernel_sizessm_state_sizei_	__class__s
            r9   rG   z)HybridMambaAttentionDynamicCache.__init__j   s   |4!'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   5"H!6"H&
key_statesvalue_states	layer_idxcache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr   rA   dim)rY   shaperZ   r2   cat)r[   rb   rc   rd   re   s        r9   updatez'HybridMambaAttentionDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr8   beam_idxc                    t        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   V y)zDReorders the cache for beam search, given the selected beam indices.r   N)	rO   lenrY   rC   index_selecttorZ   rL   rM   )r[   rn   rd   rC   s       r9   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   sD   s4>>23 		iI^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI&		ir8   c                     || j                   vr| j                   d   n|}t        | j                        |k  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rN   rp   rY   rk   )r[   rd   s     r9   get_seq_lengthz/HybridMambaAttentionDynamicCache.get_seq_length   sR     3<4CZCZ2ZD++A.`i	t~~)+~~i(..r22r8   c                     t        d      NzIHybridMambaAttentionDynamicCache does not have a legacy cache equivalent.NotImplementedErrorr[   s    r9   to_legacy_cachez0HybridMambaAttentionDynamicCache.to_legacy_cache   s    !"mnnr8   past_key_valuesr   c                     t        d      rx   ry   )clsr}   s     r9   from_legacy_cachez2HybridMambaAttentionDynamicCache.from_legacy_cache   s    !"mnnr8   N)r   )r.   r/   r0   r1   rY   rZ   is_compileabler2   float16r    rG   Tensorr5   r   dictstrr   tuplerm   r3   rs   rv   r|   classmethodFloatTensorr   __classcell__ra   s   @r9   r<   r<   X   s*    IKN>CmmTX %u{ %uX 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3ouU\\':E%,,<O'O!P o ouUEVEV?W9X0Y oes o or8   r<   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )BambaRotaryEmbeddingr=   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)rF   rG   hasattr
isinstancer   r   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr=   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r[   r=   rC   r   ra   s       r9   rG   zBambaRotaryEmbedding.__init__   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r8   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rh   r   mpscpuF)device_typeenabledrA   ri   rD   )r   floatexpandrk   rr   rC   r   r   r   r2   autocast	transposerl   cosr   sinrD   )
r[   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r9   forwardzBambaRotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r   )
r.   r/   r0   r    rG   r2   no_gradr   r   r   r   s   @r9   r   r      s3    /{ /" U]]_<  <r8   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrh   rA   ri   )rk   r2   rl   )r   x1x2s      r9   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   hidden_statesn_reprf   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rk   r   reshape)r   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrA   r
   ru   rh   )rj   rD   )ptrainingr   )r   num_key_value_groupsr2   matmulr   rk   r   
functionalsoftmaxfloat32rr   rD   r   r   
contiguous)r   r   r   r   r   r   r   r   rb   rc   attn_weightscausal_maskattn_outputs                r9   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rh   .Nri   )	unsqueezerk   r   r2   rl   )qkr   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r9   apply_rotary_pos_embr     s    , --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr8   c                       e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   de
ej                     de
e   d	e
ej                     d
ee   de	ej                  ej                  f   fdZ xZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr=   rd   c                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr   g      Tbias)rF   rG   r=   rd   getattrrS   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)r[   r=   rd   ra   s      r9   rG   zBambaAttention.__init__:  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r8   r   position_embeddingsr   past_key_valuecache_positionr   rf   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nrh   r   rA   )r   r   r   eager        )r   r   )rk   r   r   viewr   r   r   r   rm   rd   r   r=   _attn_implementationr   r   r   r   r   r   r   )r[   r   r   r   r   r   r   input_shapehidden_shapequery_statesrb   rc   r   r   re   attention_interfacer   r   s                     r9   r   zBambaAttention.forwardQ  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r8   r&   )r.   r/   r0   r1   r    r5   rG   r2   r   r   r   r   r3   r   r   r   r   r   s   @r9   r   r   7  s    G
{ 
s 
8 +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*))r8   r   c                   (     e Zd Zd fd	ZddZ xZS )BambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y r   rF   rG   r   	Parameterr2   onesweightvariance_epsilonr[   rS   epsra   s      r9   rG   zBambaRMSNormGated.__init__~  s/    ll5::k#:; #r8   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S NrA   rh   T)keepdim)rD   rr   r2   r   r   r   silupowmeanrsqrtr  r  )r[   r   gateinput_dtypevariances        r9   r   zBambaRMSNormGated.forward  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r8   gư>r   r.   r/   r0   rG   r   r   r   s   @r9   r   r   }  s    $
	;r8   r   input_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )rp   rk   r2   r   r   pad)r  r  	pad_shapes      r9   pad_tensor_by_sizer    sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr8   c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r
   r   rh   rA   )r  rp   rk   r   )r  r  
chunk_sizes      r9   reshape_into_chunksr    s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r8   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rh   .NrB   diagonalr   ru   ri   )
sizer   r2   trilr  rC   boolmasked_fillcumsuminf)r  r  masktensor_segsums       r9   segment_sumr+    s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr8   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rk   rD   rr   )r   r   rD   s      r9   apply_mask_to_padding_statesr-    sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr8   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	e
   de	ej                     de	ej                     d	e	ej                     f
d
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ	 	 	 	 dde	e
   de	ej                     de	ej                     d	e	ej                     fdZ xZS )
BambaMixeruO  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r=   rd   c           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        dt;        d      f| _        d| _        d| _         | j                  d| j0                  z  | j                  z  z   | _!        tE        jF                  | jB                  | jB                  |j                  | j                  | jB                  | j                  dz
        | _$        | j                  | jB                  z   | j                  z   }tE        jJ                  | j                  || j(                        | _&        tE        jN                  tQ        jR                  | j                              | _*        tQ        jV                  d| j                  dz         }tE        jN                  tQ        jX                  |            | _-        d	| jZ                  _.        t_        | j                  | j,                  
      | _0        tE        jN                  tQ        jR                  | j                              | _1        d	| jb                  _.        tE        jJ                  | j                  | j                  | j(                        | _2        tf        sth        jk                  d       y th        jk                  d       y )Nr   r(  gMbP?g?rA   r   )in_channelsout_channelsr   kernel_sizegroupspaddingr   Tr  a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6rF   rG   rU   	num_headsrS   rK   r^   rJ   r]   r5   rR   intermediate_sizerd   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrT   n_groupsrV   r   mamba_chunk_sizer  r   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr  r2   r  dt_biasarangelogA_log_no_weight_decayr   normDout_projis_fast_path_availableloggerwarning_once)r[   r=   rd   projection_sizeAra   s        r9   rG   zBambaMixer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11 !$U5\2" ..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQUQ^Q^_%>  fgr8   r   cache_paramsr   r   r-   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr   r   rh   ri   .r   T)zrK  dt_softplusr   r(  dt_limitF)rQ  r  r-   r<  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrA   )r
  swish)r   r  r   r<  r-   )r  rQ  rZ  r-   rd  rK  r[  )/r-  rJ  rk   rB  r^   rI   rL   rd   rM   squeezesplitr8  rG  r7  r%   rI  r  r   r<  r2   exprN  r   r   r   rr   r   rK  rQ  r   r!   rP  rR  rD  r   r#   r  r  r   r   r   r  r]   copy_r=  r$   r"   )r[   r   rX  r   r   r-   projected_statesr\   seq_lenr`   groups_time_state_sizeuse_precomputed_statesr  hidden_states_B_CdtBCrW  rK  rQ  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrL   scan_output	ssm_states                              r9   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward%  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r8   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }nt0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.*|(|j                  | j                     j9                  |.       | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nrh   ri   r   r   )shiftsdimsrE   rA   .r   ).NNr   rB   )rj   output_sizer
   r  ru   )r   r   )9rk   rD   r-  rJ  rg  r8  rG  r7  rI   rL   rd   rM   rollrr   rC   rI  r  r2   sumrf  r:  r   r=  r   r   r   r  r]   ri  rB  r^   rh  rN  r   r   r   rK  softplusclamprD  r   r   r   r   bmmrQ  repeat_interleaver  r  r  permuter'  r+  
zeros_likerl   rP  rR  )5r[   input_statesrX  r   r   r\   rk  r`   rD   rj  r  rn  ro  rm  rL   ru  r   rp  rq  rW  cache_devicerK  dAdBdBxrM   ssm_states_reshaped
C_reshapedyrQ  r  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesrw  state_decay_outC_times_statesstate_decay_out_permutedY_offrv  contextualized_statess5                                                        r9   torch_forwardzBambaMixer.torch_forward  sU    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iHii4(
 !%knnU.C D$$G &{s   v	c                 r   t         rAd| j                  j                  j                  j                  v r| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )rS  rJ  r  rC   r   rx  rz   rD   rk   rr   r  )r[   r   rX  r   r   r-   r   rD   s           r9   r   zBambaMixer.forward  s     "f0C0C0J0J0O0O&O,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r8   )NNNN)NNN)r.   r/   r0   r1   r    r5   rG   r2   r   r   r<   r3   r6   rx  r  r   r   r   s   @r9   r/  r/    sI   Ah{ Ahs AhL DH5915-1g||g ?@g !!1!12	g
 !.g %//*gZ DH5915L% ?@L% !!1!12	L%
 !.L%d DH5915-1_ ?@_ !!1!12	_
 !._ %//*_r8   r/  c                   $     e Zd Z fdZd Z xZS )BambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nr   )rF   rG   r=   rS   r8  r   r   mlp_bias	gate_projup_proj	down_projr	   r;  act_fnr[   r=   ra   s     r9   rG   zBambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r8   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r  r  r  r  )r[   r   r  s      r9   r   zBambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r8   r  r   s   @r9   r  r    s    0r8   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )BambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr   r  s      r9   rG   zBambaRMSNorm.__init__  s1     	ll5::k#:; #r8   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r  )	rD   rr   r2   r   r  r  r  r  r  )r[   r   r  r  s       r9   r   zBambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r8   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r  rk   r  r{   s    r9   
extra_reprzBambaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr8   r  )r.   r/   r0   rG   r   r  r   r   s   @r9   r  r    s    $;Jr8   r  c                   v    e Zd Zddededef fdZ	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	e   d	e	e   d
e	e   de	ej                     de	eej                  ej                  f      dee   deej                   e	eej                   ej                   f      f   fdZ xZS )BambaDecoderLayerr=   rd   
layer_typec                 r   t         |           d}|dk(  rt        nd } ||      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        || _	        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr   r6  r@   )r=   rd   	attentionzInvalid layer_type)rF   rG   r  feed_forwardr  rS   r@  input_layernormpre_ff_layernormr  r/  r@   r   	self_attn
ValueError)r[   r=   rd   r  num_expertsffn_layer_classra   s         r9   rG   zBambaDecoderLayer.__init__  s    &1Q&6(D+F3+F,>,>FDWDWX ,V-?-?VEXEX Y$ #6YGDJ;&+FI>DN122r8   r   r   r   r   output_attentions	use_cacher   r   r   rf   c	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r@   )r   rX  r   r   Nr  )r   r   r   r   r  r  r   r   r7   )r  r  r@   r  r  r  )r[   r   r   r   r   r  r  r   r   r   residualself_attn_weightsoutputss                r9   r   zBambaDecoderLayer.forward  s    D !,,]; ??g%&DJJ ++--	
 M !%__+/=t~~ 
0+-)-"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr8   )r@   )NNNFFNN)r.   r/   r0   r    r5   r   rG   r2   r   r   r3   r<   r%  r   r   r(   r   r   r   r   s   @r9   r  r    s   3{ 3s 3 3( 2637EI,1$)59KOK||K !.K u//0	K
 !!ABK $D>K D>K !!1!12K &eELL%,,,F&GHK 23K 
u  (51B1BEDUDU1U+V"WW	XKr8   r  c                   H     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ fdZ xZS )BambaPreTrainedModelr=   modelTr  r}   c                 d   t         |   |       t        |t              r|j                  j
                  j                  d       t        j                  t        j                  d|j                  dz               |j                  _        |j                  j
                  j                  d       y y )Ng      ?r   )rF   _init_weightsr   r/  rK  datafill_r2   rM  rL  r7  rN  rQ  )r[   r   ra   s     r9   r  z"BambaPreTrainedModel._init_weightsF  sx    f%fj)NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ *r8   )r.   r/   r0   r    r4   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r   r   s   @r9   r  r  :  s>    &*#,-"3NL% %r8   r  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	ee   d
ee   deej                     dee   defd              Zdej                  dej                  dej                  ded	ef
dZedej                  dededej*                  dej                  defd       Zd Z xZS )
BambaModelr=   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)rd   r  r6  )r=   F)rF   rG   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrS   embed_tokensrO   rP   rX   r  rH   
ModuleListlayersr   r  r@  final_layernormr   
rotary_embgradient_checkpointing	post_init)r[   r=   decoder_layersr_   ra   s       r9   rG   zBambaModel.__init__P  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r8   	input_idsr   r   r}   inputs_embedsr  r  output_hidden_statesr   r   rf   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }| j                  |||	||      }| j!                  ||	      }| j#                  ||      }|rdnd }|rdnd }| j$                  D ]E  }|j&                  d	k(  r|n|}|r||fz  } ||f||||||	|d
|
}|d   }|s7|d   =||d   fz  }G | j)                  |      }|r||fz  }|r|j*                  sd|_        |sd n|}t-        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rE   r   r7   r@   )r   r   r   r  r  r   r   T)last_hidden_stater}   r   
attentions)r=   r  r  r  r  r  r   rT  rU  r  r2   rL  rk   rC   r   _update_causal_mask_update_mamba_maskr  r  r  r  rI   r   )r[   r  r   r   r}   r  r  r  r  r   r   r   r   
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r9   r   zBambaModel.forwardc  sT    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..M>?L]
 ,,^^L
 #oom\J"6BD0d![[ 	:M'4'?'?7'JP[J#!m%55!)
))."3#-$7
 
M *!,M  #/"}Q'7&99N1	:4 ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r8   r  c           	         | j                   j                  dk(  r	|d|v r|S y ||j                         nd}| j                   j                  dk(  r&|s$t        j                  |||| j
                        ry |j                  }|j                  d   }t        |t        j                        r|j                  d   n||z   dz   }	| j                  |||	|||j                  d         }
| j                   j                  dk(  rQ|O|j                  j                  d	v r7|s5t        j                  |      j                  }t        j                   |
|      }
|
S )
Nflash_attention_2r   r   sdpa)r  past_key_values_lengthis_trainingr   rh   )sequence_lengthtarget_lengthrD   r   r\   )r  xpunpu)r=   r   rv   r   _ignore_causal_mask_sdpar   rD   rk   r   r2   r   5_prepare_4d_causal_attention_mask_with_cache_positionrC   r   finfomin_unmask_unattended)r[   r   r  r   r}   r  past_seen_tokensrD   r  r  r   	min_dtypes               r9   r  zBambaModel._update_causal_mask  se    ;;++/BB)c^.C%%
 @O?Z?99;`a ;;++v5>O%>>*'7 MM	 ""&,,Q/ .%,,7   $!O3a7 	 PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr8   r  r  rD   r\   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	| ddddddf   | ddddddf   k(  dddd| dddf   j                  |      }
|ddddddd|	f   |
z   }|dk(  }|ddddddd|	f   j                  ||      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuerD   rC   r   r!  rE   rh   r   )rj   r2   r  r  fullrC   triurL  r   r   clonerk   rr   r&  )r   r  r  rD   r   r\   r   r   r  mask_lengthpadding_attention_maskpadding_masks               r9   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K, ) E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*8D$9I*Jn]^`dfgim]mNn*nq?*+Q.*"U) '  +1aL[L+@ADZZ+q05@Aq,;,AV5W5c5c )6Aq!\k\12 r8   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r2   all)r[   r   r   r  s       r9   r  zBambaModel._update_mamba_mask;  s7     $
!q ^%?EIIn`aNaDbJr8   )	NNNNNNNNN)r.   r/   r0   r    rG   r   r   r   r2   r3   r   r<   r   r%  r   r(   r   r   r  staticmethodr5   rD   r  r  r   r   s   @r9   r  r  N  s   { &  151537FJ59$(,0/359`
E,,-`
 !.`
 u//0	`

 ""BC`
   1 12`
 D>`
 $D>`
 'tn`
 !!1!12`
 23`
 
!`
  `
D:: ll: 	:
 ::  :x 555 5 {{	5
 5 5 5n	r8   r  c                       e Zd ZdgZddiZddgdgfiZ fdZd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     dee   deej                      deej                     dee   dee   dee   deej                     deeej                  f   defd              Z	 	 	 	 	 	 ddZ xZS )BambaForCausalLMzlm_head.weightlm_headcolwise_repr   logitsc                 
   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        | j                          y )NFr   )rF   rG   r  r  r  r   r   rS   r  z_loss_coefficientr  r  s     r9   rG   zBambaForCausalLM.__init__M  sc     '
 ++yy!3!3V5F5FUS"(";"; 	r8   c                     || _         y r   r  )r[   decoders     r9   set_decoderzBambaForCausalLM.set_decoderW  s	    
r8   c                     | j                   S r   r  r{   s    r9   get_decoderzBambaForCausalLM.get_decoderZ  s    zzr8   r  r   r   r}   r  labelsr  r  r  r   logits_to_keeprf   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d
||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  d
||| j                   j                  d|}| j                  dkD  r[|j                  d      j                  |j                        j                  d      j!                         }|| j                  |z  z   }t#        |||j$                  |j&                  |j(                  	      S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r}   r  r  r  r  r   )r  r  r  r   rh   ri   r   rA   )lossr  r}   r   r  r7   )r=   r  r  r  r  r   r5   slicer  loss_functionr  r  	logsumexprr   rD   r  r  r   r}   r   r  )r[   r  r   r   r}   r  r  r  r  r  r   r  r   r  r   slice_indicesr  r  z_losss                      r9   r   zBambaForCausalLM.forward]  sw   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD&&*))b)1444::4FJJ1MRRTd55>>%#33!//))
 	
r8   c           	      t   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  ||||| j                  j                  |d       |
S )Nrh   r   r   rE   r  r  )r   r}   r  r   r  r   )rk   r<   r=   rD   rC   longr'  masked_fill_r   rm   num_logits_to_keep)r[   r  r}   r   r  r   r   r  r   empty_past_kvmodel_inputss              r9   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  sa    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r8   )NNNNNNNNNNr   )NNNNNT)r.   r/   r0   _tied_weights_keys_tp_plan_pp_planrG   r  r  r   r   r   r2   r3   r   r<   r   r%  r   r5   r   r   r*  r   r   s   @r9   r  r  G  sv   *+=)H_-z:;H  151537FJ59-1$(,0/35934K
E,,-K
 !.K
 u//0	K

 ""BCK
   1 12K
 ))*K
 D>K
 $D>K
 'tnK
 !!1!12K
 c5<</0K
 
 K
  K
` 8r8   r  )r  r  r  )r   )Nr   )Qtypingr   r   r   r   r   r2   r   transformers.activationsr	   cache_utilsr   r   r   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.import_utilsr   r   configuration_bambar    +mamba_ssm.ops.triton.selective_state_updater!   !mamba_ssm.ops.triton.ssd_combinedr"   r#   causal_conv1dr$   r%   
get_loggerr.   rT  r(   r<   Moduler   r   r   r5   r   r   r   r   r   r   r  r  r+  r  rS  r-  r/  r  r  r  r  r  r  __all__r7   r8   r9   <module>rB     s  6 = <   + < < ) 7 > 9 O K F & R R V , Rmm!DD-7** 
		H	%	 2dou doN<299 <D(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%6%PC)RYY C)L; ;*VU\\ VS V
(( 46FH\]^ ^_ ^_Bryy   Y'J299 J (J(]2 ]@ %? % %& u% u up \+_ \ \~ Er8   