
"""PyTorch Zamba model."""

import math
from typing import Any, Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig


if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)


logger = logging.get_logger(__name__)


class ZambaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        ZambaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class ZambaHybridDynamicCache(DynamicCache):
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors, and the expected shape of each
    tensor depends on the layer type.
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
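
    Example (a minimal usage sketch; the checkpoint name, dtype and device below are illustrative assumptions,
    not requirements of this class):

    ```python
    >>> import torch
    >>> from transformers import AutoConfig
    >>> from transformers.models.zamba.modeling_zamba import ZambaHybridDynamicCache

    >>> config = AutoConfig.from_pretrained("Zyphra/Zamba-7B-v1")
    >>> cache = ZambaHybridDynamicCache(config, batch_size=1, dtype=torch.float16, device="cpu")
    >>> # Passing `past_key_values=cache` to the model lets attention layers fill `key_cache`/`value_cache`,
    >>> # while mamba layers update `conv_states`/`ssm_states` in place.
    ```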
    """

    key_cache = None
    value_cache = None
    is_compileable = False

    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
        self.dtype = dtype
        self.layers_block_type = config.layers_block_type
        self.has_previous_state = False  # only used by mamba
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.n_mamba_heads = config.n_mamba_heads
        self.conv_states = []
        self.ssm_states = []
        self.transformer_layers = []
        self._modules = {}
        self._parameters = {}
        self._buffers = {}
        for i in range(config.num_hidden_layers):
            self.conv_states += [
                torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype)
            ]
            cache_shape = (
                batch_size,
                self.n_mamba_heads,
                self.intermediate_size // self.n_mamba_heads,
                self.ssm_state_size,
            )
            self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)]
            if self.layers_block_type[i] == "hybrid":
                self.transformer_layers.append(i)

        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]

    def __len__(self):
        return len(self.key_cache)

    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # On the first update for this layer the cached tensors are empty, so just store the new states.
        if self.key_cache[layer_idx].shape[-1] == 0:
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            device = self.key_cache[layer_idx].device
            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.value_cache[layer_idx].device
            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.conv_states[layer_idx].device
            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
            device = self.ssm_states[layer_idx].device
            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # take any layer that contains cache and not empty tensor
        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
        if len(self.key_cache) <= layer_idx:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def to_legacy_cache(self) -> tuple[tuple[torch.Tensor], tuple[torch.Tensor]]:
        raise NotImplementedError("ZambaHybridDynamicCache does not have a legacy cache equivalent.")

    @classmethod
    def from_legacy_cache(cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
        raise NotImplementedError("ZambaHybridDynamicCache does not have a legacy cache equivalent.")


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class ZambaAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
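
    A quick sanity check of the resulting dimensions (the numbers here are illustrative, not necessarily the
    released checkpoint's configuration):

    ```python
    >>> hidden_size, num_heads = 3712, 16
    >>> attention_hidden_size = 2 * hidden_size  # concatenation doubles the input width
    >>> head_dim = attention_hidden_size // num_heads
    >>> head_dim, head_dim // 2  # scaling uses head_dim / 2, i.e. the per-head width of the original hidden size
    (464, 232)
    ```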
    """

    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_hidden_size = config.attention_hidden_size
        self.head_dim = config.attention_head_dim
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.scaling = (self.head_dim / 2) ** -0.5
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights

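
# The mamba mixer below splits its `in_proj` output into `n_mamba_heads` independent heads before the selective
# scan. A rough shape sketch (the concrete numbers are illustrative, assuming intermediate_size = 7424 and
# n_mamba_heads = 8, which are not necessarily the released checkpoint's values):
#     in_proj output: (batch, 2 * 7424, seq)  -> hidden_states, gate: (batch, 7424, seq) each
#     per-head view:  (8, batch, 7424 // 8 = 928, seq), scanned independently head by head
#     concatenated:   (batch, 7424, seq), then fed to `out_proj`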
class ZambaMambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    """

    def __init__(self, config: ZambaConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.time_step_rank = config.mamba_dt_rank
        self.n_mamba_heads = config.n_mamba_heads
        self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads
        self.use_conv_bias = config.mamba_conv_bias
        self.use_bias = config.mamba_proj_bias
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=self.use_conv_bias,
            kernel_size=self.conv_kernel_size,
            groups=self.intermediate_size,
            padding=self.conv_kernel_size - 1,
        )

        self.activation = config.hidden_mamba_act
        self.act = ACT2FN[config.hidden_mamba_act]

        self.use_fast_kernels = config.use_mamba_kernels

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
        # weight associated to the selective projection used to make dt, B and C input dependent,
        # split into `n_mamba_heads` heads
        self.x_proj_weight = nn.Parameter(
            torch.zeros(
                self.n_mamba_heads,
                self.time_step_rank + self.ssm_state_size * 2,
                self.mamba_head_dim,
            )
        )
        # time step projection (discretization)
        self.dt_proj_weight = nn.Parameter(
            (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5)
            * 2
            / self.time_step_rank**0.5
        )
        self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim))

        # S4D real initialization. These are not discretized!
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()
        self.A_log = nn.Parameter(torch.log(A).reshape(self.n_mamba_heads, self.mamba_head_dim, -1))
        self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of `(selective_state_update, selective_scan_fn,"
                " causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow"
                " https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d."
                " If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
            )

    def cuda_kernels_forward(
        self, hidden_states: torch.Tensor, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None
    ):
        batch_size, seq_len, _ = hidden_states.shape
        use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1

        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(hidden_states).transpose(1, 2)

        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2).contiguous()
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)

        # 2. Convolution sequence transformation
        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
        if use_precomputed_states:
            hidden_states = causal_conv1d_update(
                hidden_states.squeeze(-1),
                cache_params.conv_states[self.layer_idx],
                conv_weights,
                self.conv1d.bias,
                self.activation,
            )
            hidden_states = hidden_states.unsqueeze(-1)
        else:
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            if cache_params is not None:
                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx].copy_(conv_states)
            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. input varying initialization of time_step, B and C
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)

        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )

        discrete_time_step = self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)

        A = -torch.exp(self.A_log.float())
        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
        time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None

        if use_precomputed_states:
            scan_outputs = torch.empty(
                (batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype
            )
            for n in range(self.n_mamba_heads):
                scan_outputs_ = selective_state_update(
                    cache_params.ssm_states[self.layer_idx][:, n],
                    hidden_states[n, ..., 0],
                    discrete_time_step[n, ..., 0],
                    A[n],
                    B[n, :, 0],
                    C[n, :, 0],
                    self.D[n],
                    gate[n, ..., 0],
                    time_proj_bias[n],
                    dt_softplus=True,
                ).unsqueeze(-1)
                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1)
        else:
            scan_outputs = torch.empty(
                (batch_size, 0, self.mamba_head_dim, seq_len), device=hidden_states.device, dtype=hidden_states.dtype
            )
            ssm_states = torch.empty(
                (batch_size, 0, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
            for n in range(self.n_mamba_heads):
                scan_outputs_, ssm_state_ = selective_scan_fn(
                    hidden_states[n],
                    discrete_time_step[n],
                    A[n],
                    B[n].transpose(1, 2),
                    C[n].transpose(1, 2),
                    self.D[n].float(),
                    gate[n],
                    time_proj_bias[n],
                    delta_softplus=True,
                    return_last_state=True,
                )
                scan_outputs = torch.cat((scan_outputs, scan_outputs_.unsqueeze(1)), dim=1).contiguous()
                ssm_states = torch.cat((ssm_states, ssm_state_.unsqueeze(1)), dim=1)
            if ssm_state_ is not None and cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_states)

        scan_outputs = scan_outputs.reshape(batch_size, -1, seq_len)

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
        return contextualized_states

    # fmt: off
    def slow_forward(self, input_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(input_states).transpose(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2).contiguous()
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)

        use_cache = isinstance(cache_params, ZambaHybridDynamicCache)
        # 2. Convolution sequence transformation
        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
            if self.training:
                # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
                ssm_state = cache_params.ssm_states[self.layer_idx].clone()
            else:
                ssm_state = cache_params.ssm_states[self.layer_idx]

            ssm_state = ssm_state.to(hidden_states.device)

            if (
                cache_params.has_previous_state
                and seq_len == 1
                and cache_params.conv_states[self.layer_idx].shape[0] == batch_size
            ):
                conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                conv_state[:, :, -1] = hidden_states[:, :, 0]
                cache_params.conv_states[self.layer_idx] = conv_state
                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)         # [batch, intermediate_size, 1] : decoding
            else:
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1]:].unsqueeze(1)
                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx] = conv_state
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1]:].unsqueeze(1)
        else:
            ssm_state = torch.zeros(
                (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device, dtype=dtype
            )
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])         # [batch, intermediate_size, seq_len]
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. input varying initialization of time_step, B and C
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)

        time_step, B, C = torch.split(ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1)

        discrete_time_step = (self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)) + self.dt_proj_bias[:, None, :, None]
        discrete_time_step = nn.functional.softplus(discrete_time_step)                 # [n_mamba_heads, batch, mamba_head_dim, seq_len]

        # 3.b. discretize time_step, B and C: zero-order hold from continuous to discrete
        A = -torch.exp(self.A_log.float())                                              # [n_mamba_heads, mamba_head_dim, ssm_state_size]
        discrete_A = torch.exp(A[:, None, :, None] * discrete_time_step[:, :, :, :, None])
        discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].float()
        deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float()
        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
        scan_outputs = []
        for i in range(seq_len):
            ssm_state = discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1)
            scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, i, :].unsqueeze(-1))
            scan_outputs.append(scan_output[:, :, :, 0])
        scan_output = torch.stack(scan_outputs, dim=-1)                                 # [n_mamba_heads, batch, mamba_head_dim, seq_len]
        scan_output = scan_output + (hidden_states * self.D[:, None, :, None])
        scan_output = scan_output * self.act(gate)

        if use_cache:
            cache_params.ssm_states[self.layer_idx] = ssm_state

        # 4. Final linear projection
        contextualized_states = self.out_proj(
            scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2)
        )
        return contextualized_states
    # fmt: on

    def forward(self, hidden_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        if self.use_fast_kernels:
            if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type:
                raise ValueError(
                    "Fast Mamba kernels are not available. Make sure they are installed and that the mamba module"
                    " is on a CUDA device. Please run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm',"
                    " or set use_mamba_kernels=False in the model's config."
                )
            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask)
        return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask)


class ZambaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class ZambaAttentionDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.self_attn = ZambaAttention(config, layer_idx)
        self.feed_forward = ZambaMLP(config)
        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class ZambaMambaDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx)
        self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        transformer_hidden_states: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # `transformer_hidden_states` is the output of the shared transformer block followed by a linear layer
        # (see fig. 2 in https://huggingface.co/papers/2405.16712); it is added to the input of the mamba layer.
        hidden_states = (
            hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states
        )
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.mamba(
            hidden_states=hidden_states,
            cache_params=past_key_value,
            attention_mask=attention_mask,
        )

        self_attn_weights = None

        # residual connection after mamba
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (past_key_value,)

        return outputs


class ZambaHybridLayer(nn.Module):
    def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer):
        super().__init__()
        self.shared_transf = shared_transf
        self.linear = linear
        self.mamba_decoder = mamba

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
                hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        layer_outputs = self.shared_transf(
            hidden_states,
            original_hidden_states=original_hidden_states,
            layer_idx=layer_idx,
            attention_mask=causal_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        transformer_hidden_states = layer_outputs[0]

        if output_attentions:
            self_attn_weights = layer_outputs[1]

        transformer_hidden_states = self.linear(transformer_hidden_states)

        layer_outputs = self.mamba_decoder(
            hidden_states,
            transformer_hidden_states=transformer_hidden_states,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        if output_attentions:
            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]

        return layer_outputs


@auto_docstring
class ZambaPreTrainedModel(PreTrainedModel):
    config: ZambaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = False
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ZambaRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, ZambaMambaMixer):
            module.x_proj_weight.data.normal_(mean=0.0, std=std)
            dt_init_std = self.config.mamba_dt_rank**-0.5
            nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std)

            mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads
            dt = torch.exp(
                torch.rand(self.config.n_mamba_heads, mamba_head_dim)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            module.dt_proj_bias.data.copy_(inv_dt)

            A = torch.arange(1, self.config.mamba_d_state + 1, dtype=torch.float32)[None, :]
            A = A.expand(self.config.mamba_expand * self.config.hidden_size, -1).contiguous()
            module.A_log.data.copy_(torch.log(A).reshape(self.config.n_mamba_heads, mamba_head_dim, -1))
            module.D.data.fill_(1.0)


@auto_docstring
class ZambaModel(ZambaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is either a
    [`ZambaMambaDecoderLayer`] or a [`ZambaHybridLayer`] (the shared [`ZambaAttentionDecoderLayer`] followed by a
    mamba decoder layer).

    Args:
        config: ZambaConfig
    """

    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        block = ZambaAttentionDecoderLayer(config)
        mamba_layers = []
        linear_layers = []
        self.layers_block_type = config.layers_block_type
        for i in range(config.num_hidden_layers):
            if config.layers_block_type[i] == "mamba":
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
            elif config.layers_block_type[i] == "hybrid":
                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
        mamba_layers = iter(mamba_layers)
        linear_layers = iter(linear_layers)
        layers = []
        self._tied_weights_keys = []
        for layer_id, layer_type in enumerate(self.layers_block_type):
            if layer_type == "hybrid":
                prefix_name = f"layers.{layer_id}."
                tied_keys = [
                    "shared_transf.self_attn.q_proj.weight",
                    "shared_transf.self_attn.k_proj.weight",
                    "shared_transf.self_attn.v_proj.weight",
                    "shared_transf.self_attn.o_proj.weight",
                    "shared_transf.feed_forward.gate_proj.weight",
                    "shared_transf.feed_forward.up_proj.weight",
                    "shared_transf.feed_forward.down_proj.weight",
                    "shared_transf.input_layernorm.weight",
                    "shared_transf.pre_ff_layernorm.weight",
                ]
                self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]]
                layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers)))
            else:
                layers.append(next(mamba_layers))
        self.layers = nn.ModuleList(layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        # word embedding output, concatenated with hidden activations to form the input of the shared transformer
        original_hidden_states = torch.clone(inputs_embeds)

        if use_cache and past_key_values is None:
            logger.warning_once(
                "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so "
                "no cache will be returned."
            )

        if cache_position is None:
            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    original_hidden_states,
                    layer_idx,
                    attention_mask,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    original_hidden_states=original_hidden_states,
                    layer_idx=layer_idx,
                    attention_mask=attention_mask,
                    causal_mask=causal_mask,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]

            if output_attentions:
                if layer_outputs[1] is not None:
                    all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
        return output if return_dict else output.to_tuple()

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        target_length = cache_position[-1] + 1

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.model = ZambaModel(config)
        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- uses a unique cache type, `ZambaHybridDynamicCache`

        empty_past_kv = past_key_values is None

        # Omit tokens covered by past_key_values
        if not empty_past_kv:
            # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
            # Exception 1: when passing input_embeds, input_ids may be missing entries
            # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
            if inputs_embeds is not None and cache_position[-1] >= input_ids.shape[1]:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = ZambaHybridDynamicCache(
                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "num_logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs


@auto_docstring(
    custom_intro="""
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
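
    Example (a hedged sketch of typical usage; the checkpoint name and label count are illustrative assumptions,
    and since released Zamba checkpoints ship no classification head, the `score` weights start untrained):

    ```python
    >>> from transformers import AutoTokenizer, ZambaForSequenceClassification

    >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")
    >>> model = ZambaForSequenceClassification.from_pretrained("Zyphra/Zamba-7B-v1", num_labels=2)

    >>> inputs = tokenizer("This film was great!", return_tensors="pt")
    >>> logits = model(**inputs).logits  # shape (1, 2): one row per sequence, one column per label
    ```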
    """
)
class ZambaForSequenceClassification(ZambaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ZambaModel(config)
        self._tied_weights_keys = self.model._tied_weights_keys
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"]