
"""PyTorch Bamba model."""

from typing import Optional, TypedDict, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.models.jamba.modeling_jamba import (
    HybridMambaAttentionDynamicCache,
    JambaAttentionDecoderLayer,
)
from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    rotate_half,
)
from transformers.models.mamba2.modeling_mamba2 import (
    MambaRMSNormGated,
    pad_tensor_by_size,
    reshape_into_chunks,
    segment_sum,
)

from ...cache_utils import DynamicLayer
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_bamba import BambaConfig


if is_mamba_2_ssm_available():
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
    selective_state_update = None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))

logger = logging.get_logger(__name__)


class BambaFlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`):
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`):
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
    """

    cu_seq_lens_q: torch.LongTensor
    cu_seq_lens_k: torch.LongTensor
    max_length_q: int
    max_length_k: int
    seq_idx: torch.IntTensor


class HybridMambaAttentionDynamicCache(HybridMambaAttentionDynamicCache):
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shapes are as follows.
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    """

    def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=None):
        super().__init__(layer_classes=DynamicLayer)
        self.layers_block_type = config.layers_block_type
        self.has_previous_state = False

        conv_kernel_size = config.mamba_d_conv
        ssm_state_size = config.mamba_d_state

        self.conv_states = []
        self.ssm_states = []
        self.transformer_layers = []
        for i in range(config.num_hidden_layers):
            if self.layers_block_type[i] == "mamba":
                self.conv_states += [
                    torch.zeros(
                        batch_size,
                        (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size),
                        conv_kernel_size,
                        device=device,
                        dtype=dtype,
                    )
                ]
                self.ssm_states += [
                    torch.zeros(
                        batch_size,
                        config.mamba_n_heads,
                        config.mamba_d_head,
                        ssm_state_size,
                        device=device,
                        dtype=dtype,
                    )
                ]
            else:
                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
                self.transformer_layers.append(i)

        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]


class BambaRotaryEmbedding(LlamaRotaryEmbedding):
    pass
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Keep the non-rotary tail of the features for later concatenation
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    # Apply rotary embeddings on the rotary slice
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    # Concatenate back to the full head dimension
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed
class BambaAttention(LlamaAttention):
    pass


class BambaRMSNormGated(MambaRMSNormGated):
    pass


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    """
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

    return hidden_states
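# Example of the zeroing above on a left-padded batch (toy values; shapes are
# (batch, seq_len, hidden)):
#
#     hidden_states = torch.ones(2, 3, 4)
#     attention_mask = torch.tensor([[1, 1, 1], [0, 1, 1]])
#     out = apply_mask_to_padding_states(hidden_states, attention_mask)
#     # out[1, 0] is all zeros; every other position is unchanged. With batch_size == 1
#     # or seq_len == 1 the helper is a no-op by design.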
# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
class BambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    There are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - A few non-obvious bugs with batching in the slow path that exist in main are fixed here
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    """

    def __init__(self, config: BambaConfig, layer_idx: int):
        super().__init__()
        self.num_heads = config.mamba_n_heads
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.intermediate_size = int(config.mamba_expand * self.hidden_size)
        self.layer_idx = layer_idx
        self.use_conv_bias = config.mamba_conv_bias
        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]
        self.use_bias = config.mamba_proj_bias

        self.layer_norm_epsilon = config.rms_norm_eps

        self.n_groups = config.mamba_n_groups
        self.head_dim = config.mamba_d_head
        self.chunk_size = config.mamba_chunk_size

        self.time_step_limit = (0.0, float("inf"))
        self.time_step_min = 0.001
        self.time_step_max = 0.1

        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
        self.conv1d = nn.Conv1d(
            in_channels=self.conv_dim,
            out_channels=self.conv_dim,
            bias=config.mamba_conv_bias,
            kernel_size=config.mamba_d_conv,
            groups=self.conv_dim,
            padding=config.mamba_d_conv - 1,
        )

        # projection of the input hidden states
        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
        self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=self.use_bias)

        # time step projection (discretization)
        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = torch.arange(1, self.num_heads + 1)
        self.A_log = nn.Parameter(torch.log(A))
        self.A_log._no_weight_decay = True
        self.norm = BambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon)
        self.D = nn.Parameter(torch.ones(self.num_heads))
        self.D._no_weight_decay = True

        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn,"
                " causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow"
                " https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d"
            )
        else:
            logger.warning_once("The fast path for Bamba will be used when running the model on a GPU")

    def cuda_kernels_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        seq_idx: Optional[torch.IntTensor] = None,
    ):
        # 1. Gated MLP's linear projection
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
        projected_states = self.in_proj(hidden_states)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape
        groups_time_state_size = self.n_groups * self.ssm_state_size

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_params.conv_states[self.layer_idx].shape[0]
            == cache_params.ssm_states[self.layer_idx].shape[0]
            == batch_size
            and cache_position is not None
            and cache_position[0] > 0
        )

        # getting projected states from cache if it exists
        if use_precomputed_states:
            gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
                [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
            )

            # 2. Convolution sequence transformation
            hidden_states_B_C = causal_conv1d_update(
                hidden_states_B_C,
                cache_params.conv_states[self.layer_idx],
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )

            hidden_states, B, C = torch.split(
                hidden_states_B_C,
                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                dim=-1,
            )

            # 3. SSM transformation
            A = -torch.exp(self.A_log.float())  # (nheads,)
            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
            D = self.D[:, None, ...].expand(-1, self.head_dim)
            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
            hidden_states = selective_state_update(
                cache_params.ssm_states[self.layer_idx],
                hidden_states_reshaped,
                dt,
                A,
                B,
                C,
                D,
                z=None,
                dt_bias=dt_bias,
                dt_softplus=True,
            )
            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
            hidden_states = self.norm(hidden_states, gate)

            # 4. Final linear projection
            out = self.out_proj(hidden_states)[:, None, ...]
        # Fused calculations or step by step if no initialized cache is found
        else:
            A = -torch.exp(self.A_log.float())  # (num_heads,)
            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}

            # 2-4. Fused kernel for conv1d, SSM, and the final projection
            if self.training and cache_params is None:
                out = mamba_split_conv1d_scan_combined(
                    projected_states,
                    self.conv1d.weight.squeeze(1),
                    self.conv1d.bias,
                    self.dt_bias,
                    A,
                    D=self.D,
                    chunk_size=self.chunk_size,
                    seq_idx=seq_idx,
                    activation=self.activation,
                    rmsnorm_weight=self.norm.weight,
                    rmsnorm_eps=self.norm.variance_epsilon,
                    outproj_weight=self.out_proj.weight,
                    outproj_bias=self.out_proj.bias,
                    headdim=self.head_dim,
                    ngroups=self.n_groups,
                    norm_before_gate=False,
                    return_final_states=False,
                    **dt_limit_kwargs,
                )
            else:
                gate, hidden_states_B_C, dt = projected_states.split(
                    [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
                )

                # 2. Convolution sequence transformation
                # Init cache
                if cache_params is not None:
                    hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
                    conv_states = nn.functional.pad(
                        hidden_states_B_C_transposed,
                        (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
                    )
                    cache_params.conv_states[self.layer_idx].copy_(conv_states)

                if self.activation not in ["silu", "swish"]:
                    hidden_states_B_C = self.act(
                        self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
                    )
                else:
                    hidden_states_B_C = causal_conv1d_fn(
                        x=hidden_states_B_C.transpose(1, 2),
                        weight=self.conv1d.weight.squeeze(1),
                        bias=self.conv1d.bias,
                        activation=self.activation,
                        seq_idx=seq_idx,
                    ).transpose(1, 2)

                hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
                hidden_states, B, C = torch.split(
                    hidden_states_B_C,
                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                    dim=-1,
                )

                # 3. SSM transformation
                scan_output, ssm_state = mamba_chunk_scan_combined(
                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
                    dt,
                    A,
                    B.view(batch_size, seq_len, self.n_groups, -1),
                    C.view(batch_size, seq_len, self.n_groups, -1),
                    chunk_size=self.chunk_size,
                    D=self.D,
                    z=None,
                    seq_idx=seq_idx,
                    return_final_states=True,
                    dt_bias=self.dt_bias,
                    dt_softplus=True,
                    **dt_limit_kwargs,
                )

                # Init cache
                if ssm_state is not None and cache_params is not None:
                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

                scan_output = scan_output.view(batch_size, seq_len, -1)

                # Multiply "gate" branch and apply extra normalization layer
                scan_output = self.norm(scan_output, gate)

                # 4. Final linear projection
                out = self.out_proj(scan_output)
        return out

    def torch_forward(
        self,
        input_states,
        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype

        # 1. Gated MLP's linear projection
        input_states = apply_mask_to_padding_states(input_states, attention_mask)
        projected_states = self.in_proj(input_states)
        gate, hidden_states_B_C, dt = projected_states.split(
            [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
        )

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_params.conv_states[self.layer_idx].shape[0]
            == cache_params.ssm_states[self.layer_idx].shape[0]
            == batch_size
            and cache_position is not None
            and cache_position[0] > 0
        )

        # 2. Convolution sequence transformation
        if use_precomputed_states:
            cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(
                shifts=-1, dims=-1
            )
            cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(
                cache_params.conv_states[self.layer_idx].device
            )

            # We need to guarantee that anything regarding the cache is on the same device
            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)

            hidden_states_B_C = torch.sum(conv_states * self.conv1d.weight.squeeze(1), dim=-1)
            if self.use_conv_bias:
                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
            hidden_states_B_C = self.act(hidden_states_B_C)
        else:
            # Init cache
            if cache_params is not None:
                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
                conv_states = nn.functional.pad(
                    hidden_states_B_C_transposed,
                    (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
                )
                cache_params.conv_states[self.layer_idx].copy_(conv_states)

            hidden_states_B_C = self.act(
                self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
            )

        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
        hidden_states, B, C = torch.split(
            hidden_states_B_C,
            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
            dim=-1,
        )

        # 3. SSM transformation
        A = -torch.exp(self.A_log.float())  # [num_heads]
        if use_precomputed_states:
            # We need to guarantee that anything regarding the cache is on the same device
            cache_device = cache_params.ssm_states[self.layer_idx].device

            # Note: there is no need to pad parameter matrices here, as there is just one new token
            # for batched generation
            dt = dt[:, 0, :][:, None, ...]
            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
            # [num_heads] -> [num_heads, head_dim]
            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)

            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            # [bsz, num_heads, head_dim, state_size]
            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)

            # Discretize B
            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
            B = B.reshape(batch_size, -1, B.shape[-1])
            # [bsz, num_heads, head_dim, state_size]
            dB = dt[..., None] * B[..., None, :]

            # Discretize x into dB
            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
            dBx = (dB * hidden_states[..., None]).to(device=cache_device)

            # State calculation
            cache_params.ssm_states[self.layer_idx].copy_(cache_params.ssm_states[self.layer_idx] * dA + dBx)

            # Subsequent output
            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
            C = C.reshape(batch_size, -1, C.shape[-1])

            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # [b, h, d, n]
            # Reshape ssm_states to merge the first two dimensions
            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)
            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)
            y = torch.bmm(ssm_states_reshaped, C_reshaped)
            y = y.view(batch_size, self.num_heads, self.head_dim)

            # D skip connection
            # [num_heads] -> [num_heads, head_dim]
            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
            y = (y + hidden_states * D).to(y.dtype)

            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
            y = y.reshape(batch_size, -1)[:, None, ...]
        else:
            # begin ssd naive implementation without einsums
            dt = nn.functional.softplus(dt + self.dt_bias)
            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size

            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

            # Discretize x and A
            hidden_states = hidden_states * dt[..., None]
            A = A.to(hidden_states.dtype) * dt

            # Rearrange into blocks/chunks
            hidden_states, A, B, C = (
                reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)
            )

            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
            A = A.permute(0, 3, 1, 2)
            A_cumsum = torch.cumsum(A, dim=-1)

            # 1. Compute the output for each intra-chunk (diagonal blocks)
            # This is the analog of a causal mask
            L = torch.exp(segment_sum(A))

            # Contraction of C and B to get G (attention-weights like)
            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # (b, c, l, s, h, n)
            G = G_intermediate.sum(dim=-1)  # (b, c, l, s, h)

            # Compute M, equivalent to applying attention mask to weights
            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
            M = M_intermediate.sum(dim=-1)

            # Compute Y_diag (apply to values)
            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)

            # 2. Compute the state for each intra-chunk
            # (right term of low-rank factorization of off-diagonal blocks; B terms)
            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
            B_decay = B * decay_states.permute(0, 2, 3, 1)[..., None]
            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)

            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
            # (middle term of factorization of off-diag blocks; A terms)
            if use_precomputed_states:
                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
            else:
                previous_states = torch.zeros_like(states[:, :1])
            states = torch.cat([previous_states, states], dim=1)
            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
            decay_chunk = decay_chunk.transpose(1, 3)
            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
            states, ssm_state = new_states[:, :-1], new_states[:, -1]

            # 4. Compute state -> output conversion per chunk
            # (left term of low-rank factorization of off-diagonal blocks; C terms)
            state_decay_out = torch.exp(A_cumsum)
            C_times_states = C[..., None, :] * states[:, :, None, ...]
            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
            Y_off = C_times_states.sum(-1) * state_decay_out_permuted[..., None]

            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
            y = Y_diag + Y_off
            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)

            y = y + D_residual
            # Cutting off padded chunks
            if pad_size > 0:
                y = y[:, :seq_len, :, :]
            y = y.reshape(batch_size, seq_len, -1)

            # Init cache
            if ssm_state is not None and cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

        scan_output = self.norm(y, gate)
        # end ssd naive

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
        return contextualized_states

    def forward(
        self,
        hidden_states,
        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        seq_idx: Optional[torch.IntTensor] = None,
        **kwargs,
    ):
        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx)
        if seq_idx is not None:
            raise NotImplementedError(
                "`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`"
            )

        dtype = hidden_states.dtype
        if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
            hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)


class BambaMLP(LlamaMLP):
    pass


class BambaRMSNorm(LlamaRMSNorm):
    pass


class BambaDecoderLayer(JambaAttentionDecoderLayer):
    def __init__(self, config: BambaConfig, layer_idx: int, layer_type: str = "mamba"):
        super().__init__()
        del self.self_attn

        num_experts = 1
        ffn_layer_class = BambaMLP if num_experts == 1 else None
        self.feed_forward = ffn_layer_class(config)

        self.layer_type = layer_type
        if layer_type == "mamba":
            self.mamba = BambaMixer(config=config, layer_idx=layer_idx)
        elif layer_type == "attention":
            self.self_attn = BambaAttention(config, layer_idx)
        else:
            raise ValueError("Invalid layer_type")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[BambaFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # this is a hybrid decoder layer
        if self.layer_type == "mamba":
            hidden_states = self.mamba(
                hidden_states=hidden_states,
                cache_params=past_key_value,
                cache_position=cache_position,
                attention_mask=attention_mask,
                **kwargs,
            )
            self_attn_weights = None
        elif self.layer_type == "attention":
            hidden_states, self_attn_weights = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        # residual connection after attention
        hidden_states = residual + hidden_states

        # feed-forward
        residual = hidden_states
        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BambaPreTrainedModel(PreTrainedModel):
    config: BambaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["BambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _is_stateful = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, BambaMixer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
            module.D.data.fill_(1.0)


@auto_docstring
class BambaModel(BambaPreTrainedModel):
    def __init__(self, config: BambaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)

        decoder_layers = []
        for i in range(config.num_hidden_layers):
            decoder_layers.append(BambaDecoderLayer(config, layer_idx=i, layer_type=config.layers_block_type[i]))
        self.layers = nn.ModuleList(decoder_layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = BambaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[BambaFlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        if use_cache and past_key_values is None:
            logger.warning_once(
                "Bamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was "
                "provided, so no cache will be returned."
            )

        if cache_position is None:
            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                if layer_outputs[1] is not None:
                    # append attentions only of attention layers; Mamba layers return `None` as the attention weights
                    all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        next_cache = None if not use_cache else past_key_values

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: HybridMambaAttentionDynamicCache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0

        if self.config._attn_implementation == "sdpa" and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        target_length = (
            attention_mask.shape[-1]
            if isinstance(attention_mask, torch.Tensor)
            else past_seen_tokens + sequence_length + 1
        )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, for example the relevant first rows when using left padding.
            # This is required by F.scaled_dot_product_attention's memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_mamba_mask(self, attention_mask, cache_position):
        """
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        mamba_mask = attention_mask
        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
            mamba_mask = None
        return mamba_mask


class BambaForCausalLM(LlamaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.z_loss_coefficient = config.z_loss_coefficient
        # Initialize weights and apply final processing
        self.post_init()
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
            if self.z_loss_coefficient > 0:
                z_loss = logits.logsumexp(dim=-1).to(loss.dtype).pow(2).mean()
                loss = loss + self.z_loss_coefficient * z_loss

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`

        empty_past_kv = past_key_values is None

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, in which case we don't want to do this
        if not empty_past_kv:
            if inputs_embeds is not None and cache_position[0] >= input_ids.shape[1]:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Exception 2
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = HybridMambaAttentionDynamicCache(
                self.config, input_ids.shape[0], self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs


__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"]