
    rh                        d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlm Z   ejB                  e"      Z# e       rddl$m%Z% ndZ% e       rddl&m'Z'm(Z( ddl)m*Z* nd\  Z*Z(Z' e       r	ddl+m,Z,m-Z- nd\  Z-Z, G d d      Z. G d de
j^                        Z0 G d de
j^                        Z1 G d de      Z2e G d d e             Z3e ed!"       G d# d$e                    Z4e ed%"       G d& d'e                    Z5e G d( d)e3             Z6 ed*"       G d+ d,e3e             Z7g d-Z8y).zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNN)causal_conv1d_fncausal_conv1d_update)NNc                       e Zd ZdZdZej                  dfdededej                  de
ej                  edf   fdZd	ed
ej                  dej                  dej                  fdZd	edej                  fdZd Zy)
MambaCachea  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> past_key_values = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values
        MambaCache()
        ```
    TNconfigmax_batch_sizedtypedevicec                    || _         || _        |j                  | _        |j                  | _        |j
                  | _        g | _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                   | j                  | j                  || j                        }t        j                  | j                   | j                  | j                  || j                        }t        j                  j                  |       t        j                  j                  |       | j                  j!                  |       | j                  j!                  |        y )Nr#   r"   )r!   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr#   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr    r!   r"   r#   _
conv_state	ssm_states           {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mamba/modeling_mamba.py__init__zMambaCache.__init__^   s/    -!'!9!9$// & 2 2/1.0)/);f%v//0 	.A',{{##&&%%kk(J ',kk##&&##kk'I MM--j9MM--i8##J/OO""9-'	.    	layer_idxnew_conv_statecache_positionreturnc                 "   | j                   |   j                  |j                  k7  r5| j                   |   j                  |j                        | j                   |<   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j                  |j                  |j                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r   )shiftsdimsr%   )r,   r#   toclampr+   rollr"   zero_)r5   r<   r=   r>   r7   s        r9   update_conv_statezMambaCache.update_conv_state   s    
 I&--1F1FF*.*:*:9*E*H*HI^I^*_DY'%%i0
'--a1F1F1JK__BR_8
+9+<+<JDUDU]g]m]m+<+n
1a'(#))+#z1#	**r;   new_ssm_statec                     | j                   |   j                          | j                   |xx   |j                  | j                   |   j                        z  cc<   | j                   |   S N)r-   rG   rD   r#   )r5   r<   rI   s      r9   update_ssm_statezMambaCache.update_ssm_state   sT    	"((*	"m&6&6ty7Q7X7X&YY"y))r;   c                     t        t        | j                              D ]<  }| j                  |   j                          | j                  |   j                          > y rK   )r/   lenr,   rG   r-   )r5   r<   s     r9   resetzMambaCache.reset   sM    s4#3#345 	/IY'--/OOI&,,.	/r;   )__name__
__module____qualname____doc__is_compileabler.   float16r   intr"   r   r#   strr:   Tensor
LongTensorrH   rL   rO    r;   r9   r   r   ;   s    > N #]]15#. #. #. {{	#.
 ellC-.#.J++.3ll+LQL\L\+	+"*# *ell *
/r;   r   c            
       F    e Zd ZdZdedef fdZd Z	 	 	 ddej                  de
e   de
ej                     d	e
ej                     fd
Zdde
e   de
ej                     d	e
ej                     fdZ	 	 	 dde
e   de
ej                     d	e
ej                     fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r    r<   c           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                        | _
        || _        |j                  | _        t        j                  | j                  | j                  |j                  |j                  | j                  |j                  dz
        | _        |j                   | _        t$        |j                      | _        |j(                  | _        t        j*                  | j                  | j                  dz  |j,                        | _        t        j*                  | j                  | j                  | j
                  dz  z   d      | _        t        j*                  | j                  | j                  d      | _        t5        j6                  d| j
                  dz   t4        j8                        d d d f   }|j;                  | j                  d      j=                         }t        j>                  t5        j@                  |            | _!        t        j>                  t5        jD                  | j                              | _#        t        j*                  | j                  | j                  |j,                        | _$        |j,                  | _        | jK                          y )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r`   FTr"   rA   )&superr:   r    hidden_sizer(   r)   r*   r+   r'   rV   time_step_rankr<   use_conv_biasr   Conv1dconv1d
hidden_act
activationr
   actuse_mambapyLinearuse_biasin_projx_projdt_projr.   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r5   r    r<   A	__class__s       r9   r:   zMambaMixer.__init__   s1   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!-- yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!35==I$PQ'RHHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQWQ`Q`a%%'r;   c                     t        t        t        t        t        t
        f      }|sM| j                  r+t               rt        j                  d       y t        d      t        j                  d       y y )Na7  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allr   r   r   r   r   rp   r   loggerwarning_onceImportError)r5   is_fast_path_availables     r9   r   z#MambaMixer.warn_slow_implementation   sw    !$#%68HJ^`no"
 &')''F & Z  ##J &r;   hidden_statescache_paramsr>   attention_maskc                 	   | j                  |      j                  dd      }| j                  r%|"t        || j                  j
                  | j                  r| j                  j                  nd | j                  j
                  | j                  j
                  | j                  j
                  | j                  r$| j                  j                  j                         nd t        j                  | j                  j                                d d | j                   j                         | j                  j                  j                         d      }|S |j#                  dd      \  }}|||j%                  d      z  }| j                  j
                  j'                  | j                  j
                  j)                  d      | j                  j
                  j)                  d            }|m|d   dkD  ret+        |j-                  d      |j.                  | j0                     || j                  j                  | j2                        }|j%                  d      }n|Yt4        j6                  j9                  || j:                  |j<                  d   z
  df      }	|j?                  | j0                  |	|       tA        ||| j                  j                  | j2                        }|||j%                  d      z  }| j                  |j                  dd            }
t        jB                  |
| jD                  | jF                  | jF                  gd      \  }}}| j                  j
                  |j                  dd      z  }t        j                  | j                  j                                }tI        | j                  d	      r$| j                  j                  j                         nd }|e|d   dkD  r]tK        |jL                  | j0                     |d
   |d
   ||d d df   |d d df   | j                   |d
   |d
      j%                  d      }nptO        ||||j                  dd      |j                  dd      | j                   j                         ||dd
      \  }}|||jQ                  | j0                  |       | j                  |j                  dd            }|S )Nr   rd   T)
delta_biasdelta_softplusdimr   rA   )rn   r`   ).r   )dt_softplus)r   return_last_state))rs   	transposetrainingr   rl   weightrj   r`   rt   ru   r   rr   floatr.   expr|   r~   chunk	unsqueezeviewsizer   squeezer,   r<   rn   r   
functionalpadr+   shaperH   r   splitri   r)   hasattrr   r-   r   rL   )r5   r   r   r>   r   projected_statescontextualized_statesgateconv_weightsr,   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsr8   s                      r9   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward   s7     <<6@@AF==\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!p %$O #3"8"8"8"BM4) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;P^_ 0!<1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOIq! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$r;   c           	      X   |j                   \  }}}|j                  }| j                  |      j                  dd      }	|	j	                  dd      \  }
}||
|j                  d      z  }
||j                  | j                     j                         }|j                  |
j                        }|j                   d   | j                  k(  rt        j                  j                  |
| j                  |
j                   d   z
  df      }|j                  | j                  ||       | j!                  | j#                  |
      dd |f         }
n9|j                  | j                  |
|      }|j                  | j"                  j$                  j                        }t'        j(                  || j"                  j$                  d d dd d f   z  d      }
| j*                  r|
| j"                  j,                  z  }
| j!                  |
      j                  |      j                  d      }
n`t'        j.                  || j0                  | j2                  f|
j                  |      }| j!                  | j#                  |
      dd |f         }
||
|j                  d      z  }
| j5                  |
j                  dd            }t'        j6                  || j8                  | j2                  | j2                  gd      \  }}}| j;                  |      }t        j                  j=                  |      j                  dd      }t'        j>                  | j@                  jC                                }t'        j>                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jC                         z  }||
d d d d d d d f   jC                         z  }| jD                  r| jF                  r|tI        |j                  dd      |j                  dd            }||j                  d      z  jK                  d      j                  dd      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }ng }tO        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t'        jP                  |j                  |      |d d |d d f   j                  d            }|jS                  |d d d d df           t'        jT                  |d      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }|(|j                  | j                     jW                  |       | jY                  |j                  dd            }|S )	Nr   rd   r   r   rA   .r%   r	   )-r   r"   rs   r   r   r   r-   r<   clonerD   r#   r+   r   r   r   rH   ro   rl   r   r.   sumrj   r`   r1   r'   r)   rt   r   ri   ru   softplusr   r|   r   rp   r   r   r   r~   r/   matmulr4   stackcopy_r   )r5   input_statesr   r>   r   
batch_sizeseq_lenr6   r"   r   r   r   r8   r7   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   s                               r9   slow_forwardzMambaMixer.slow_forwardP  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~z>Z $])CC'M)R S);;DNNM[ij
']]4;;+=+=+D+DE
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB OT33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6KL7^ :&q!Qz2Y>!QPQST*AUU	#ll9<<+>!Q'
@T@TUW@XY##K1a$89:  ++l;K%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$r;   c                 2   t        t        t        t        t        t
        f      }|r^d| j                  j                  j                  j                  v r2t        j                  j                         s| j                  ||||      S | j                  ||||      S )Ncuda)r   r   r   r   r   r   rt   r   r#   typer.   r2   is_compilingr   r   )r5   r   r   r>   r   r   s         r9   forwardzMambaMixer.forward  s     "%#%68HJ^`no"
 "f0B0B0I0I0N0N&NW\WdWdWqWqWs,,]L.Zhii  nn]]r;   r   )rP   rQ   rR   rS   r   rV   r:   r   r.   rX   r   r   rY   r   r   r   __classcell__r   s   @r9   r\   r\      s   )({ )(s )(V4 .25959c%||c% z*c% !!1!12	c%
 !!1!12c%LO%x
7K O%aijojzjza{ O%  S[  \a  \l  \l  Sm O%j .25959^ z*^ !!1!12	^
 !!1!12^r;   r\   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)rg   r:   r   rz   r.   r}   r   variance_epsilon)r5   rh   epsr   s      r9   r:   zMambaRMSNorm.__init__  s1     	ll5::k#:; #r;   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nrd   rA   T)keepdim)	r"   rD   r.   rw   powmeanrsqrtr   r   )r5   r   input_dtypevariances       r9   r   zMambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r;   c                 R    | j                   j                  d    d| j                   S )Nr   z, eps=)r   r   r   r5   s    r9   
extra_reprzMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEEr;   )gư>)rP   rQ   rR   r:   r   r   r   r   s   @r9   r   r     s    $;Fr;   r   c                   t     e Zd Z fdZ	 	 	 ddee   deej                     deej                     fdZ xZ	S )
MambaBlockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||      | _
        y )Nr   r<   )rg   r:   r    r<   residual_in_fp32r   rh   layer_norm_epsilonnormr\   mixer)r5   r    r<   r   s      r9   r:   zMambaBlock.__init__  sR    " & 7 7 !3!39R9RS	)<
r;   r   r>   r   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )Nrf   r   r>   r   )r   rD   r   r"   r   r.   rw   r   )r5   r   r   r>   r   residuals         r9   r   zMambaBlock.forward  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0r;   r   )
rP   rQ   rR   r:   r   r   r.   rY   r   r   r   s   @r9   r   r     sQ    = .25959 z* !!1!12	
 !!1!12r;   r   c                   2    e Zd ZU eed<   dZddgZdZdZd Z	y)MambaPreTrainedModelr    backboner   r\   Tc                 &   | j                   j                  }t        |t              rCt	        j
                  d|j                  dz   t        j                        dddf   }|j                  |j                  d      j                         }|j                  j                  t	        j                  |             d|j                  _        d|j                  _        |j                  j                   j#                  d       | j                   j$                  dz  | j                   j&                  z  }| j                   j(                  dk(  r5t*        j,                  j/                  |j0                  j2                  |       nO| j                   j(                  d	k(  r6t*        j,                  j5                  |j0                  j2                  | |       t	        j6                  t	        j8                  | j                   j                        t;        j                  | j                   j<                        t;        j                  | j                   j>                        z
  z  t;        j                  | j                   j>                        z         jA                  | j                   jB                  
      }|t	        j                  t	        jD                  |              z   }|j0                  jF                  j                  |       d|j0                  jF                  _$        t*        j,                  jK                  |jL                  j2                  t;        jN                  d             |jL                  jF                  TtQ        |jL                  jF                  dd      s3t*        j,                  jS                  |jL                  jF                         t*        j,                  jK                  |jT                  j2                  t;        jN                  d             | j                   jV                  rB|jT                  j2                  }|t;        jN                  | j                   jX                        z  }t        |t*        jZ                        rtQ        |j2                  dd      s+t*        j,                  j]                  |j2                  |       |jF                  BtQ        |jF                  dd      s*t*        j,                  jS                  |jF                         yyyt        |t^              r&|j2                  j                   j#                  d       yt        |t*        j`                        r,t*        j,                  j]                  |j2                  |       yy)zInitialize the weights.r   rf   NrA   Tg      ?g      constantrandom)min   )a
_no_reinitF)std)1r    initializer_range
isinstancer\   r.   rv   r)   rw   rx   r'   ry   r|   r   r{   _no_weight_decayr~   datafill_ri   time_step_scaletime_step_init_schemer   init	constant_ru   r   uniform_r   randmathtime_step_maxtime_step_minrE   time_step_floorexpm1r`   r   kaiming_uniform_rl   sqrtgetattrzeros_r   rescale_prenorm_residualr0   rq   normal_r   	Embedding)r5   moduler   r   dt_init_stddtinv_dtps           r9   _init_weightsz"MambaPreTrainedModel._init_weights  s   kk++fj) Q 5 5 9OPTVWPWXA1126AACALLuyy|,,0FLL)(,FHH%HHMM$++44d:T[[=X=XXK{{00J>!!&.."7"7E22h>  !6!6kR

4;;88988DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f--1FNN*GG$$V]]%9%9TYYq\$J}}!!-v}}11<GGGNN6==#5#56GG$$V__%;%;tyy|$L{{33 OO**TYYt{{<<==fbii(6==,>37{{&v{{L%@GGNN6;;/ A ' -MM$$S)-GGOOFMMsO3 .r;   N)
rP   rQ   rR   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr  rZ   r;   r9   r   r     s)    "%|4&*#L:4r;   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   |    e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeeej                        ed<   y)MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rP   rQ   rR   rS   r  r   r.   FloatTensorr	  r   r   r   tuplerZ   r;   r9   r  r  &  sH     6:x 1 129)-L(:&-8<M8E%"3"345<r;   r  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   y)MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rP   rQ   rR   rS   r  r   r.   r  r	  r  r   r   r   r  rZ   r;   r9   r  r  :  s\    
 )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<r;   r  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee   dee	j                     dee	j                     deeef   fd       Z xZS )
MambaModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )Nr   Fr   )rg   r:   r   r  
vocab_sizerh   
embeddings
ModuleListr/   r0   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r5   r    idxr   s      r9   r:   zMambaModel.__init__U  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$r3Z#%F$rs&+#"6#5#56;T;TU//? %ss   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)r5   
state_dictprefixargsks        r9   r"  zMambaModel.load_hooka  s;     	Aq EO^^TUEV
199\=AB	r;   c                     | j                   S rK   r  r   s    r9   get_input_embeddingszMambaModel.get_input_embeddingsg  s    r;   c                     || _         y rK   r-  r5   new_embeddingss     r9   set_input_embeddingszMambaModel.set_input_embeddingsj  s	    (r;   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr>   r   r?   c	                 8   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}	|rdnd}
| j                  D ]  } ||	|||	      }	|s|
|	fz   }
 | j!                  |	      }	|r|
|	fz   }
|st#        d
 |	||
fD              S t%        |	|r||
      S d|
      S )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r%   r#   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrZ   r   c              3   &   K   | ]	  }||  y wrK   rZ   ).0vs     r9   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>  s     fqXYXefs   )r  r   r   )r    r6  r   r5  use_return_dict
ValueErrorr  r  r   r   r#   r"   r.   rv   r*   r  r   r  r  )r5   r3  r4  r   r5  r6  r7  r>   r   r   all_hidden_statesmixer_blocks               r9   r   zMambaModel.forwardm  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;; 		IK')--	M $$58H$H!		I M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
r;   )NNNNNNNN)rP   rQ   rR   r:   r"  r.  r2  r   r   r.   rY   r   boolr   r  r  r   r   r   s   @r9   r  r  S  s    
)  1548-1$(/3&*5959L
E,,-L
   0 01L
 z*	L

 D>L
 'tnL
 d^L
 !!1!12L
 !!1!12L
 
uk!	"L
 L
r;   r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       e Zd ZdgZ fdZd Zd Z	 ddedee	e
f   dedee	e
f   fd	Z	 	 	 	 	 dd
ee   deej                      deej                      fdZe	 	 	 	 	 	 	 	 	 ddeej                      deej                      deej&                     d
ee   deej                      dee   dee   dee   deej*                     deeef   fd       Z xZS )MambaForCausalLMzlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFre   )
rg   r:   r  r   r   rq   rh   r  lm_headr#  )r5   r    r   s     r9   r:   zMambaForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr;   c                 6    | j                   j                         S rK   )r   r.  r   s    r9   r.  z%MambaForCausalLM.get_input_embeddings  s    }}1133r;   c                 8    | j                   j                  |      S rK   )r   r2  r0  s     r9   r2  z%MambaForCausalLM.set_input_embeddings  s    }}11.AAr;   outputsmodel_kwargsnum_new_tokensr?   c                    |j                  dd       |d<   |j                  dd      rd|v r|d   |d   dd  |z   |d<   d|v r?|d   }t        j                  ||j                  |j                  d   df      gd	      |d<   |S )
Nr   r5  Tr>   rA   r   r   r   r   )getr.   catnew_onesr   )r5   rI  rJ  rK  kwargsr   s         r9   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H^$[$/ L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* r;   r   r>   r   c                    d|j                         i}|r|t        j                  d| j                  j                  j
                  |j                        }|d|i}|j                  d      }	n|j                  d      }	t        | j                  j                  |	| j                  | j                        }|r3|d   dkD  r+|d d df   j                  d      j                         |d<   d }|s|d|i}|j                  ||||d       |S )Nr3  r   r9  r4  r%   rA   )r   r5  r>   r   )ry   r.   rv   r   r    r*   r#   r   r   r"   r   update)
r5   r3  r4  r5  r   r>   r   rP  model_inputsr!   s
             r9   prepare_inputs_for_generationz.MambaForCausalLM.prepare_inputs_for_generation  s    $Y%9%9%;<-
 #\\!T]]-A-A-M-MV_VfVfgN( /?!.!3!3A!6!*!2%dmm&:&:NSWS^S^fjfpfpqL*Q.(1!R%(8(B(B2(F(Q(Q(SL%!N]6+];L ,&"0"0		
 r;   r3  r4  labelsr6  r7  r5  c
           
         ||n| j                   j                  }| j                  |||||||	|      }|d   }| j                  |j	                  | j                  j
                  j                              j                         }d}||j	                  |j                        }|dddddf   j                         }|dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   r4  r6  r7  r5  r>   r   r   .rA   r   )r  r  r   r   )r    r>  r   rF  rD   r   r"   r   r#   ry   r   r   r   r  r   r   )r5   r3  r   r4  r   rV  r6  r7  r5  r>   rP  mamba_outputsr   r  r  shift_logitsshift_labelsloss_fctoutputs                      r9   r   zMambaForCausalLM.forward  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
r;   )r   )NNNNN)	NNNNNNNNN)rP   rQ   rR   _tied_weights_keysr:   r.  r2  r   dictrW   r   rV   rQ  r   r   r.   rY   rU  r   r  rB  rX   r   r  r  r   r   r   s   @r9   rD  rD    s    ++4B YZ"26sCx.RU	c3h, -15959(
 z*( !!1!12( !!1!12(T  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
r;   rD  )rD  r  r   r   )9rS   r   dataclassesr   typingr   r   r   r.   torch.utils.checkpointr   torch.nnr   activationsr
   configuration_utilsr   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   configuration_mambar   
get_loggerrP   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r   r   Moduler\   r   r   r   r  r  r  rD  __all__rZ   r;   r9   <module>rr     s     ! ' '    % ! 3 ) 9 - 
 k j , 
		H	%#EXR@P=-~DD-7**b/ b/JN^ N^bF299 F(+ 8 A4? A4 A4H 
=+ = = 
=+ = =& f
% f
 f
R J
+_ J
J
Z Sr;   