
    rhk                     "   d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ  ej6                  e      Z e       rddlmZ ddl m!Z!m"Z" nd\  Z!Z"Z e       r	ddl#m$Z$m%Z% nd\  Z%Z$ e&ee!e"e$e%f      Z'dejP                  de)fdZ*d Z+d Z,d Z- G d d      Z. G d dej                  j^                        Z0 G d de	j^                        Z1 G d  d!e	j^                        Z2 G d" d#e      Z3e G d$ d%e             Z4e ed&'       G d( d)e                    Z5e ed*'       G d+ d,e                    Z6e G d- d.e4             Z7 ed/'       G d0 d1e4e             Z8g d2Z9y)3zPyTorch MAMBA2 model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )Mamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_update)NNinput_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shapes      }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizer'   B   sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UU    c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r      )r'   r    r!   reshape)r   r   
chunk_sizes      r&   reshape_into_chunksr.   M   s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r(   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r*   .Ndevicedtype)diagonalr   dim)
sizeexpandr"   trilonesr2   boolmasked_fillcumsuminf)r   r-   masktensor_segsums       r&   segment_sumrB   a   s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr(   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r!   r3   to)hidden_statesattention_maskr3   s      r&   apply_mask_to_padding_statesrG   u   sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr(   c            
           e Zd ZdZej
                  dfdededej                  de	e
   fdZ	 dded	ej                  d
edej                  fdZdedej                  fdZd Zy)Mamba2Cachea  
    Arguments:
        config: Mamba2Config
        batch_size: int
        dtype: torch.dtype
        device: torch.device

    Attributes:
        dtype: (`torch.dtype`):
            The default `dtype` used to initializing the cache.
        conv_kernel_size: (`int`):
            Model's convolution kernel size taken from config.
        n_groups: (`int`):
            Model's number of groups taken from the config - similar to tensor parallel in Transformer.
        state_size: (`int`):
            Model's SSM state size taken from config.
        num_heads: (`int`):
            The number of heads used in the linear attention / SSM.
        head_dim: (`int`):
            The respective dimension of the heads used in the linear attention / SSM.
        intermediate_size: (`int`):
            Model's intermediate_size based on (expand * hidden_dim) from config.
        conv_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
        ssm_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
    Nconfig
batch_sizer3   r2   c           	      R   || _         |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  |j                  z        | _
        t        j                  |j                  || j                  d| j                  z  | j                  z  z   | j                  ||      | _        t        j                  |j                  || j
                  | j                  | j                  ||      | _        y )Nr+   r1   )r3   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr9   hidden_sizeintermediate_sizer"   zerosnum_hidden_layersconv_states
ssm_states)selfrJ   rK   r3   r2   s        r&   __init__zMamba2Cache.__init__   s     
 & 2 2 ++))!$V]]V5G5G%G!H ;;$$""Q%6%HH!!
  ++$$NNMMOO
r(   	layer_idxnew_conv_state
cache_initreturnc                 p   |r3|j                  | j                  j                        | j                  |<   ns| j                  |   j                  dd      | j                  |<   |d d dd d f   j                  | j                  j                        | j                  |   d d d d df<   | j                  |   S )Nr*   )shiftsdimsr   )rD   rX   r2   roll)rZ   r\   r]   r^   s       r&   update_conv_statezMamba2Cache.update_conv_state   s     *8*;*;D<L<L<S<S*TDY'*.*:*:9*E*J*JRT[]*J*^DY'4B1a74K4N4NtO_O_OfOf4gDY'1b1	**r(   new_ssm_statec                     |j                  | j                  j                        | j                  |<   | j                  |   S N)rD   rY   r2   )rZ   r\   re   s      r&   update_ssm_statezMamba2Cache.update_ssm_state   s4    %2%5%5doo6L6L%M	"y))r(   c                 l    | j                   j                          | j                  j                          y rg   )rX   zero_rY   rZ   s    r&   resetzMamba2Cache.reset   s$     r(   )F)__name__
__module____qualname____doc__r"   float16r   rS   r3   r   strr[   Tensorr<   rd   rh   rl    r(   r&   rI   rI      s    : KP--qu
"
03
<AKK
aijman
< PU++.3ll+HL+	+*# *ell * r(   rI   c                   (     e Zd Zd fd	ZddZ xZS )MambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y rg   superr[   r   	Parameterr"   r;   weightvariance_epsilonrZ   rT   eps	__class__s      r&   r[   zMambaRMSNormGated.__init__   s/    ll5::k#:; #r(   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S Nr+   r*   T)keepdim)r3   rD   r"   float32r   r#   silupowmeanrsqrtr|   r{   )rZ   rE   gateinput_dtypevariances        r&   forwardzMambaRMSNormGated.forward   s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r(   gư>rg   rm   rn   ro   r[   r   __classcell__r   s   @r&   rv   rv      s    $
	;r(   rv   c            
       ^    e Zd ZdZdedef fdZ	 	 	 ddej                  de	e
   de	ej                     de	ej                     fd	Z	 	 	 ddej                  de	e
   de	ej                     de	ej                     fd
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ xZS )Mamba2Mixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rJ   r\   c           	      b   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                  | j                  z        | _
        t        |j                        | _        || _        |j                  | _        |j                  | _        t         |j                     | _        |j$                  | _        |j&                  | _        |j(                  | _        |j*                  | _        |j,                  | _        |j.                  | _        |j0                  | _        |j2                  | _        | j                  d| j(                  z  | j
                  z  z   | _        t7        j8                  | j4                  | j4                  |j                  |j                  | j4                  |j                  dz
        | _        | j                  | j4                  z   | j                  z   }t7        j<                  | j                  ||j>                        | _         t7        jB                  tE        jF                  | j                              | _$        tE        jJ                  d| j                  dz         }t7        jB                  tE        jL                  |            | _'        d| jN                  _(        tS        | j                  | j$                        | _*        t7        jB                  tE        jF                  | j                              | _+        d| jV                  _(        t7        j<                  | j                  | j                  |j>                        | _,        |j>                  | _        tZ        st\        j_                  d       y y )Nr+   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   Tr~   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0ry   r[   rQ   rT   rP   ssm_state_sizerM   rN   rS   r9   rU   time_step_rankr\   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normrO   rR   r-   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearuse_biasin_projrz   r"   r;   dt_biasarangelogA_log_no_weight_decayrv   normDout_projis_fast_path_availableloggerwarning_once)rZ   rJ   r\   projection_sizeAr   s        r&   r[   zMamba2Mixer.__init__   s   ))!--$// & 2 2!$V]]T5E5E%E!F!&"7"78"#11 ++&++,"(";"; ++%55#11#11..T]]1BTEXEX1XXii%%**==&&*
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQWQ`Q`a%> &r(   rE   cache_paramscache_positionrF   c                    t        ||      }| j                  |      }|j                  \  }}}| j                  | j                  z  }	|j                  d   d| j
                  z  z
  d| j                  z  | j                  z  z
  | j                  z
  dz  }
|||d   dkD  r|j                  d      j                  |
|
| j
                  | j                  | j                  gd      \  }}}}}t        ||j                  | j                     | j                  j                  j                  d      | j                  j                  | j                         }t#        j                  || j
                  |	|	gd      \  }}}t#        j$                  | j&                  j)                                }|d d d df   d d d d d f   j+                  d| j,                  | j                        j/                  t"        j0                        }|d d d d d f   j+                  dd| j,                        }| j2                  d d d df   j+                  d| j,                        }| j4                  d d d df   j+                  d| j,                        }|j7                  || j                  |j                  d   | j                  z        }|j7                  || j                  |j                  d   | j                  z        }|j7                  || j                  | j,                        }t9        |j:                  | j                     ||||||d |d	
      }|j7                  || j                  | j,                  z        }| j=                  ||      }| j?                  |      d d d df   }|S t#        j$                  | j&                  j)                                }| j@                  d
t)        d      fk(  ri nd| j@                  i}| jB                  r|tE        || j                  j                  j                  d      | j                  j                  | j2                  |f| j4                  | jF                  d | j                   | j<                  j                  | j<                  jH                  | j>                  j                  | j>                  j                  | j,                  | j                  ddd|}|S |j                  |
|
| j
                  | j                  | j                  gd      \  }}}}}|l|jK                  dd      }tL        jN                  jQ                  ||jR                  |j                  d   z
  df      }|jU                  | j                  |d       | j                   dvrH| jW                  | j                  |jK                  dd            dd |f   jK                  dd            }nptY        |jK                  dd      | j                  j                  j                  d      | j                  j                  | j                         jK                  dd      }t        ||      }t#        j                  || j
                  |	|	gd      \  }}}t[        |j7                  ||d| j,                        |||j7                  ||| j                  d      |j7                  ||| j                  d      f| jF                  | j4                  d d d| j2                  dd|\  }}|||j]                  | j                  |       |j7                  ||d      }| j=                  ||      }| j?                  |      }|S )Nr*   r+   r   r   r6   .r3   T)zr   dt_softplusg        r?   dt_limitF)r   r-   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr\   r]   r^   )r   swish)xr{   r   r   )r-   r   r   r   r   r   r   r\   re   )/rG   r   r!   rO   r   rU   rQ   squeezesplitr   r   rX   r\   r   r{   r   r   r"   expr   floatr9   rR   rD   r   r   r   viewr   rY   r   r   r   trainingr   r-   r|   	transposer   r#   r$   rN   rd   r   r   r   rh   )rZ   rE   r   r   rF   projected_statesrK   seq_len_groups_time_state_sized_mlpr   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrX   scan_output	ssm_states                             r&   cuda_kernels_forwardz Mamba2Mixer.cuda_kernels_forward)  s    5]NS<<6 "/!4!4
GQ!%1D1D!D""2&$((()$--$"5"556 nn  #(B~VWGX[\G\0@0H0H0K0Q0Qt55t}}dnnU[] 1R 1-Aq$)2
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<Cz 
s 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%h 
} 5E4J4JE4#9#94==$..Y_a 5K 511d-r  +3D3N3NqRS3T0"$--"3"34%669U9[9[\^9__abc#K !22"&..Y] 3  ??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'??	)
  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff (, LL $* &*&Y" (\-E 11DNNZc1d)..z7BG"iiT: mmK0
r(   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j                   d   d| j                  z  z
  d| j
                  z  | j                  z  z
  | j                  z
  dz  }
|	j                  |
|
| j                  | j                  | j                  gd      \  }}}}}|||d   dkD  r|j                  | j                  |d       |j                  | j                     j                  | j                  j                  j                         }t#        j$                  || j                  j                  j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|l|j/                  dd      }t0        j2                  j5                  ||j6                  |j                   d   z
  df      }|j                  | j                  |d	       | j-                  | j                  |j/                  dd            d
d |f   j/                  dd            }t        ||      }t#        j                  || j                  | j
                  | j                  z  | j
                  | j                  z  gd      \  }}}t#        j8                  | j:                  j=                                }|||d   dkD  r|j>                  j                   }|d d dd d f   d d d d
f   }|j/                  dd      jA                  ||j                   d   | jB                        }| jD                  d   jA                  | jD                  j                   d   | jB                        }t"        j0                  j2                  jG                  ||j                  |j                        z         }t#        jH                  || jJ                  d   | jJ                  d         }|d   jA                  | j                  | jB                  | j                        j                  t"        jL                        }t#        j8                  |d   |z        j                  |      }|jO                  || j
                  d      d
d d d f   }|jA                  || j
                  | j                  | j
                  z  |j                   d         jQ                         }|jO                  |d|j                   d         }|d   |d
d d d f   z  }|jO                  |d| jB                        }||d   z  j                  |      }|jS                  | j                  |j>                  | j                     |z  |z          |jO                  || j
                  d      d
d d d f   }|jA                  || j
                  | j                  | j
                  z  |j                   d         jQ                         }|jO                  |d|j                   d         }|j>                  | j                     j                  |j                   |j                        }|jU                  || j                  z  | jB                  | j                        }|jU                  || j                  z  | j                  d      }t#        jV                  ||      }|jU                  || j                  | jB                        }| jX                  d   jA                  | jX                  j                   d   | jB                        }|||z  z   j                  |j                        }|jO                  |d      d d d d
f   }nt0        j2                  jG                  || jD                  z         }t#        jH                  || jJ                  d   | jJ                  d         }|jO                  ||d| jB                        j=                         }|jO                  ||d| j                        j=                         }|jO                  ||d| j                        j=                         }|j[                  | j                  | j
                  z  d| j                        }|j[                  | j                  | j
                  z  d| j                        }| j\                  || j\                  z  z
  | j\                  z  }| jX                  d   t_        ||      z  }||d   z  }|j                  |j                        |z  }||||fD cg c]  }ta        ||| j\                         c}\  }}}}|jc                  dddd      }t#        jd                  |d      } t#        j8                  tg        |            }!|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }"|"j%                  d      }#|#d   |!jc                  ddddd      d   z  }$|$j%                  d      }%|%d   |d d d d d f   z  j%                  d      }&t#        j8                  | d d d d d d dd f   | z
        }'||'jc                  dddd      d   z  }(|(d
d d d f   |d   z  j%                  d      })|F|D|d   dkD  r<|j>                  | j                     d d d d
f   j                  |)j                         }*nt#        jh                  |)d d d df         }*t#        jj                  |*|)gd      })t#        j8                  tg        t0        j2                  j5                  | d d d d d d df   d                  }+|+j/                  dd      }+|+d   |)d d d d d d
f   z  j%                  d      },|,d d d df   |,d d df   }-})t#        j8                  |       }.|d
d d d f   |)d d d d d d
f   z  }/|.jc                  dddd      }0|/j%                  d      |0d   z  }1|&|1z   }|jO                  |d| j                  | jB                        }||z   }|dkD  r|d d d |d d d d f   }|jO                  ||d      }|-||jS                  | j                  |-       | jm                  ||      }2| jo                  |2j                  |            }3|3S c c}w )Nr*   r+   r6   r   Fr   r2   r   T.r0   ).NNr   r   r1   )r7   output_sizer   r   r5   )r   r   )8r!   r3   rG   r   rU   rO   r   rQ   r   r   rd   r\   rX   rD   r   r{   r2   r"   sumr   r   r   r   r   r   r#   r$   rN   r   r   r   rY   r9   rR   r   softplusclampr   r   r,   
contiguousrh   r   bmmr   repeat_interleaver-   r'   r.   permuter>   rB   
zeros_likecatr   r   )4rZ   rE   r   r   rF   rK   r   r   r3   r   r   r   r   r   rX   r   r   r   r   cache_devicer   dAdBdBxrY   ssm_states_reshaped
C_reshapedyr   r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess4                                                       r&   torch_forwardzMamba2Mixer.torch_forward  s    "/!4!4
GQ## 5]NS<<6!''+a$2H2H.HH1t}}K\_c_r_rKrrsw  tB  tB  B  GH  H,<,B,Bt55t~~V\^ -C -
)1d%r
 #(B~VWGX[\G\**T^^Terw*x '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//0<3P3PSoSuSuvxSy3y{|2} ..Xcpt.u $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**#(B~VWGX[\G\'2299L Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ))..*55dnnEJSP *  		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF 'N,F>Z[K\_`K`"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A--V_-`ii4(
 !%knnU.C D$$G &{s   sc                     t         r@d| j                  j                  j                  j                  v r| j                  ||||      S | j                  ||||      S )Ncuda)r   r   r{   r2   typer   r  )rZ   rE   r   r   rF   s        r&   r   zMamba2Mixer.forward  sS     "f0C0C0J0J0O0O&O,,]L.Zhii!!-~~^^r(   r   )rm   rn   ro   rp   r   rS   r[   r"   rs   r   rI   
LongTensorr   r  r   r   r   s   @r&   r   r      s   @| @ @J /35915`||` {+` !!1!12	`
 !.`L -126/3B%||B% {+B%   0 01	B%
 !.B%P /35915	_ {+	_ !!1!12		_
 !.	_r(   r   c                   &     e Zd Zd fd	Zd Z xZS )Mamba2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zM
        Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        Nrx   r}   s      r&   r[   zMamba2RMSNorm.__init__  s1     	ll5::k#:; #r(   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r   )	r3   rD   r"   r   r   r   r   r|   r{   )rZ   rE   r   r   s       r&   r   zMamba2RMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r(   r   r   r   s   @r&   r  r    s    $;r(   r  c                   t     e Zd Z fdZ	 	 	 ddee   deej                     deej                     fdZ	 xZ
S )Mamba2Blockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||      | _
        y )Nr   r\   )ry   r[   rJ   r\   residual_in_fp32r  rT   r   r   r   mixer)rZ   rJ   r\   r   s      r&   r[   zMamba2Block.__init__  sR    " & 7 7!&"4"4&:S:ST	 9=
r(   r   r   rF   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )Nr   r   r   rF   )r   rD   r{   r3   r  r"   r   r  )rZ   rE   r   r   rF   residuals         r&   r   zMamba2Block.forward  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0r(   r   )rm   rn   ro   r[   r   rI   r"   r  rs   r   r   r   s   @r&   r  r    sO    > /35915 {+ !!1!12	
 !.r(   r  c                   0    e Zd ZU eed<   dZdgZdZdZd Z	y)Mamba2PreTrainedModelrJ   backboner  Tc                 	   | j                   j                  }t        |t              r+t	        j
                  d| j                   j                  dz         }|j                  j                  t	        j                  |             d|j                  _
        d|j                  _
        |j                  j                  j                  d       t	        j                  t	        j                  | j                   j                        t!        j                  | j                   j"                        t!        j                  | j                   j$                        z
  z  t!        j                  | j                   j$                        z         j'                  | j                   j(                        }|t	        j                  t	        j*                  |              z   }|j,                  j                  |       d|j,                  _        t0        j2                  j5                  |j6                  j8                  t!        j:                  d             |j6                  j<                  Tt?        |j6                  j<                  dd	      s3t0        j2                  jA                  |j6                  j<                         t0        j2                  j5                  |jB                  j8                  t!        j:                  d             | j                   jD                  rB|jB                  j8                  }|t!        j:                  | j                   jF                        z  }t        |t0        jH                        rt?        |j8                  dd	      s+t0        j2                  jK                  |j8                  |
       |j<                  Bt?        |j<                  dd	      s*t0        j2                  jA                  |j<                         yyyt        |tL        tN        f      r&|j8                  j                  j                  d       yt        |t0        jP                        r,t0        j2                  jK                  |j8                  |
       yy)zInitialize the weights.r   Tg      ?)min   )aN
_no_reinitF)std))rJ   initializer_range
isinstancer   r"   r   rQ   r   copy_r   r   r   datafill_r   randmathr   r   r   time_step_floorexpm1r   r$  r   initkaiming_uniform_r   r{   sqrtr   getattrzeros_r   rescale_prenorm_residualrW   r   normal_r  rv   	Embedding)rZ   moduler%  r   r   inv_dtps          r&   _init_weightsz#Mamba2PreTrainedModel._init_weights  s   kk++fk* Q 5 5 9:ALLuyy|,,0FLL)(,FHH%HHMM$

4;;00188DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN  ((,FNN%GG$$V]]%9%9TYYq\$J}}!!-v}}11<GGGNN6==#5#56GG$$V__%;%;tyy|$L{{33 OO**TYYt{{<<==fbii(6==,>37{{&v{{L%@GGNN6;;/ A ' 0A BCMM$$S)-GGOOFMMsO3 .r(   N)
rm   rn   ro   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr:  rt   r(   r&   r  r    s&    "&&*#L44r(   r  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                   |    e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeeej                        ed<   y)Mamba2Outputa:  
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   rE   )rm   rn   ro   rp   rC  r   r"   FloatTensorr;  r   rI   rE   tuplert   r(   r&   rB  rB  	  sH     6:x 1 129*.L(;'.8<M8E%"3"345<r(   rB  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   y)Mamba2CausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   rE   )rm   rn   ro   rp   rH  r   r"   rD  r;  rI  r   rI   rE   rE  rt   r(   r&   rG  rG    s\    
 )-D(5$$
%,*.FHU&&'.*.L(;'.8<M8E%"3"345<r(   rG  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee   dee	j                     dee	j                     deeef   fd       Z xZS )Mamba2Modelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )Nr  Fr   )ry   r[   r   r6  
vocab_sizerT   
embeddings
ModuleListrangerW   r  layersgradient_checkpointingr  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rZ   rJ   idxr   s      r&   r[   zMamba2Model.__init__:  s     ,,v'8'8&:L:LMmmSXY_YqYqSr$sC[3%G$st&+##F$6$6F<U<UV//? %ts   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)rZ   
state_dictprefixargsks        r&   rU  zMamba2Model.load_hookF  s;     	Aq EO^^TUEV
199\=AB	r(   c                     | j                   S rg   rN  rk   s    r&   get_input_embeddingsz Mamba2Model.get_input_embeddingsL  s    r(   c                     || _         y rg   r`  rZ   new_embeddingss     r&   set_input_embeddingsz Mamba2Model.set_input_embeddingsO  s	    (r(   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   rF   r_   c	                 8   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}
|rdnd}| j                  D ]  } ||
|||	      }
|s||
fz   } | j!                  |
      }
|r||
fz   }|st#        d
 |
||fD              S t%        |
|r||      S d|      S )a  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r1   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrt   r  c              3   &   K   | ]	  }||  y wrg   rt   ).0vs     r&   	<genexpr>z&Mamba2Model.forward.<locals>.<genexpr>  s     fqXYXefs   )rC  r   rE   )rJ   ri  r   rh  use_return_dict
ValueErrorrN  rR  rI   r8   r2   r3   r"   r   rM   rQ  rS  rE  rB  )rZ   rf  rg  r   rh  ri  rj  r   rF   kwargsrE   all_hidden_statesmixer_blocks                r&   r   zMamba2Model.forwardR  s   0 %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#*KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;; 		IK')--	M $$58H$H!		I M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
r(   )NNNNNNNN)rm   rn   ro   r[   rU  ra  re  r   r   r"   r  rI   r<   rs   r   rE  rB  r   r   r   s   @r&   rK  rK  8  s    
)  1548.2$(/3&*5915P
E,,-P
   0 01P
 {+	P

 D>P
 'tnP
 d^P
 !!1!12P
 !.P
 
ul"	#P
 P
r(   rK  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                   z    e Zd Zg Z fdZd Zd Z	 	 	 	 	 ddee   dee	j                     dee	j                     fdZe	 	 	 	 	 	 	 	 	 ddee	j                     d	ee	j                     dee   d
ee	j                     dee   dee   dee   dee	j                     dee	j                     deeef   fd       Z xZS )Mamba2ForCausalLMc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
ry   r[   rK  r  r   r   rT   rM  lm_headrV  )rZ   rJ   r   s     r&   r[   zMamba2ForCausalLM.__init__  sF     #F+yy!3!3V5F5FUSr(   c                 6    | j                   j                         S rg   )r  ra  rk   s    r&   ra  z&Mamba2ForCausalLM.get_input_embeddings  s    }}1133r(   c                 8    | j                   j                  |      S rg   )r  re  rc  s     r&   re  z&Mamba2ForCausalLM.set_input_embeddings  s    }}11.AAr(   r   r   rF   c                    d|j                         i}|r|t        j                  d| j                  j                  j
                  |j                        }|d|i}|j                  d      }	n|j                  d      }	t        | j                  j                  |	| j                  | j                        }|r3|d   dkD  r+|d d df   j                  d      j                         |d<   d }|s|d|i}|j                  ||||d       |S )Nrf  r   r   rg  r1   r*   )r   rh  r   rF   )r   r"   r   r  rJ   rM   r2   r8   rI   r3   	unsqueezeupdate)
rZ   rf  rg  rh  r   r   rF   rr  model_inputsmax_batch_sizes
             r&   prepare_inputs_for_generationz/Mamba2ForCausalLM.prepare_inputs_for_generation  s    $Y%9%9%;<-
 #\\!T]]-A-A-M-MV_VfVfgN( /?!.!3!3A!6!*!2&t}}';';^TXT_T_gkgqgqrL*Q.(1!R%(8(B(B2(F(Q(Q(SL%!N]6+];L ,&"0"0		
 r(   rf  rg  labelsri  rj  rh  r_   c
           
         ||n| j                   j                  }| j                  ||||||||	      }|d   }| j                  |j	                  | j                  j
                  j                              j                         }d}|* | j                  d||| j                   j                  d|
}|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )ao  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        N)r   rg  ri  rj  rh  r   rF   r   )rI  r  rM  r   )rH  rI  r   rE   rt   )rJ   rp  r  rx  rD   r{   r3   r   loss_functionrM  rG  r   rE   )rZ   rf  rg  r   r  ri  rj  rh  r   rF   rr  mamba2_outputsrE   rI  rH  outputs                   r&   r   zMamba2ForCausalLM.forward  s	   8 &1%<k$++B]B]%'!5#)) ' 	
 'q)m..t||/B/B/H/HIJPPR%4%%pVFt{{OeOepiopDY!33F)-)9TGf$EvE#'44(66	
 	
r(   )NNNNN)	NNNNNNNNN)rm   rn   ro   _tied_weights_keysr[   ra  re  r   rI   r"   r  rs   r  r   rD  r<   r   rE  rG  r   r   r   s   @r&   rv  rv    sK    4B .25915(
 {+( !!1!12( !.(T  1559.2-1/3&*$(15158
E,,-8
   1 128
 {+	8

 ))*8
 'tn8
 d^8
 D>8
 !.8
 !.8
 
u**	+8
 8
r(   rv  )rv  rK  r  ):rp   r,  dataclassesr   typingr   r   r"   torch.utils.checkpointr   activationsr   
generationr	   modeling_layersr
   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_mamba2r   
get_loggerrm   r   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr   r   causal_conv1dr   r   allr   rs   rS   r'   r.   rB   rG   rI   Modulerv   r   r  r  r  rB  rG  rK  rv  __all__rt   r(   r&   <module>r     s     ! "    ! ) 9 - 
 W . 
		H	% RmmZjW?AWDD-7**!( VU\\ VS V
((J  J Z; ;${_")) {_|;BII ;", 8 ;4O ;4 ;4| =; = = =; = =& j
' j
 j
Z s
- s
s
l Hr(   