
    rh             *          d Z ddlmZ ddlmZmZ ddlZddlmc m	Z
 ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ  e       rddlmZ ddlmZmZ ddlmZ dZn4ddl m!Z! ddlm"Z"m#Z# ddlm$Z$ e%ejL                  ejL                  ejL                  f   Z'e(e)e'f   ZdZd`dejL                  deee*ejL                  f      dejL                  fdZ	 	 	 	 	 	 	 	 	 dadejL                  dejL                  dejL                  dejL                  dejL                  dejL                  d ejL                  d!ejL                  d"ejL                  d#ejL                  d$ee*   d%e)d&e)de%ejL                  ejL                  ejL                  f   fd'Z+	 	 	 dbd)ejL                  dejL                  dejL                  dejL                  dejL                  d ejL                  dejL                  dejL                  d$e*d%e)d&e)d*e*de%ejL                  ejL                  ejL                  f   fd+Z,	 	 	 	 	 	 	 	 dcd,ejL                  d-ejL                  d.ejL                  d/ejL                  d0ejL                  d1ejL                  d2ejL                  d3ejL                  d$ee*   d4e-d5e-d%e)d*e*de%ejL                  ejL                  ejL                  ee%ejL                  ejL                  ejL                  f      ee%ejL                  ejL                  ejL                  f      f   fd6Z.	 	 	 	 	 	 ddd,ejL                  d-ejL                  d.ejL                  d/ejL                  d0ejL                  d7ejL                  d8ejL                  d9ejL                  d4e-d*e*d%e)deejL                  e%ejL                  e%ejL                  ejL                  ejL                  f   f   f   fd:Z/d(ej`                  fd,ejL                  d-ejL                  d.ejL                  d/ejL                  d0ejL                  d1ejL                  d2ejL                  d3ejL                  d*e*d;ejb                  de%ejL                  e%ejL                  ejL                  ejL                  f   f   fd<Z2ddddd(ej`                  fd,ejL                  d-ejL                  d.ejL                  d/ejL                  d0ejL                  d7ejL                  d8ejL                  d9ejL                  d4e-d*e*d;ejb                  de%ejL                  ejL                  ejL                  ee%ejL                  ejL                  ejL                  f      ee%ejL                  ejL                  ejL                  f      f   fd=Z3ddddd(ejh                  dfd>e"d,ejL                  d-ejL                  d.ejL                  d0ejL                  d/ejL                  d7ejL                  d8ejL                  d9ejL                  d4e-d*e*d?ejb                  d%e)deejL                  e%ejL                  e%ejL                  ejL                  ejL                  f   f   f   fd@Z5ddddd(ejh                  ddfd>e"dAe"dBe"d,ejL                  d-ejL                  d.ejL                  d0ejL                  d/ejL                  d7ejL                  d8ejL                  d9ejL                  d4e-d*e*d?ejb                  d%e)dCe-deejL                  e%ejL                  e%ejL                  ejL                  ejL                  f   f   f   f"dDZ6 G dE dFejn                        Z8 G dG dHejn                        Z G dI dJejn                        Z9 G dK dLejn                        Z: G dM dNejn                        Z; G dO dPejn                        ZdQ Z<dR Z= G dS dTe      Z> G dU dV      Z?ee G dW dXe                    Z@e G dY dZe>             ZAee G d[ d\e                    ZBe G d] d^e>e             ZCg d_ZDy)ezPyTorch xLSTM Model.    )	dataclass)OptionalUnionN)nn)CrossEntropyLoss   )GenerationMixin)PreTrainedModel)ModelOutputauto_docstringcan_return_tupleis_xlstm_available   )xLSTMConfig)
mLSTMBlock)mLSTMStateTypesoft_cap)xLSTMRMSNormT)partial)CallableLiteral)round_up_to_next_multiple_ofFvalues	cap_valuereturnc                 @    || S |t        j                  | |z        z  S )a  
        Soft caps a tensor to a value.

        Performs a tanh operation on the logits and scales the result to the cap value. Common technique in attention
        and output language heads to prevent large logits from dominating the softmax. See for example Gemma2:
        https://arxiv.org/abs/2408.00118

        Args:
            values: The tensor to cap.
            cap_value: The value to cap the values to. If None, no cap is applied.

        Returns:
            The capped values.
        )torchtanh)r   r   s     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/xlstm/modeling_xlstm.pyr   r   0   s(     M5::fy&8999    @   matKmatVvecBvecImatC_statesvecN_statesscaMinter_statesmatC_initialvecN_initialscaMinter_initialqk_scale
chunk_size
num_chunksc                    g | j                   |j                   d   \  }}}}}|}| j                  | j                  }}|
|dz  }
|"t        j                  |||dz   |z  |f||      }|!t        j                  |||dz   |z  f||      }|t        j                  |||dz   f||      }|t        j                  ||||f||      n|}|t        j                  |||f||      n|}|	t        j                  ||df||      n|	}|d   |z
  |z   }|d   }|j                  d      j                  }|j                  d      }t        d|      D ]Q  }||d d d d ||z  |dz   |z  d d f<   ||d d d d ||z  |dz   |z  f<   ||d d d d |f<   |d d d d |f   }|d d d d |f   }t        j
                  ||z   |      }| d d d d ||z  |dz   |z  d d f   }|d d d d ||z  |dz   |z  d d f   } |d d d d |d d f   }!t        j                  |!|d   z
        d d d d d d d f   }"||"z  }#t        j                  ||z   |z
        d d d d d f   }$|$d   |z  |#j                  d	d      | z  z   }%|$|z  |#j                  d	d      j                  d      z   }&|}|%}|&}T ||d d d d | d d d f<   ||d d d d | d f<   ||d d d d df<   |||fS )
N      r   dtypedevice).r0   N).r0   r   ).N)shaper3   r4   r   zerosmaxr   squeezerangeexp	transposesum)'r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   
batch_sizenh_dhqkdhhvnc_dtype_devicematC_kvecN_kscaM_inter_kvecAscaGscaA_maxkey
scaA_max_kscaG_kscaM_inter_k_next
matK_chunk
matV_chunkvecA_k	vecAbar_kmatK_chunk_gated	scaGbar_kmatC_k_nextvecN_k_nexts'                                          r   mlstm_chunkwise_recurrent_fw_CrX   C   s    )D(CTZZ^(C%
B4**dkkTzH ++z2Q$&MU[dklK++z2Q$&Gv^efK#${{JR!V+EV\cd
 # KKRt4F7S 	 R^QeEKKR.fWMkw 	
 !( KKR+6'J" 	
 M"T)D0G}88B<&&#++B/J' 	!C CIK1cDjC!Gt+;;Q>??EK1cDjC!Gt+;;;<*6Q3Y' "!Q),J!Q)_F %		&<*? LaC*$4a:7M$MqPQJaC*$4a:7M$MqPQJ!QQ,'F		&+<Y+G"GHAqRVWI)I5		&<"7:K"KLQPQSWZXI $I.7:J:T:TUWY[:\`j:kkK $f,/?/I/I"b/Q/U/UVX/YYK -L F F?	!D (.Aq4%&!O$$*Aq4%&L!%1Ar"K)999r    ư>matQepsc                 H   | j                   }|
|	}	}|j                  \  }}}}|j                  |||||z  |      }|j                  |||||z        }|}| j                  ||||	|      } |j                  ||||	|      }|j                  ||||	|      }t        j                  t        j
                  |	|	ft        j                  |            }|d d d d d d d d d f   |d d d d d d d d d f   z
  }t        j                  ||t        d             }||d d d d d d d d d f   z   }t        j                  |dd      j                  }||d d d d d d d f   z   }t        j                  ||      }|d d d d d d d d d f   }|d d d d d d d d d f   }||z
  }t        j                  |      }| |j                  dd      z  |z  }||z  }t        j                  ||z
        } | | z  |z  }!|!|z  ||z  z   }"|!|j                  d      z  |j                  dd      z   }#t        j                  t        j                   |#      t        j                  |             }$|"|$|z   z  }%|%j                  ||||	z  |      }&|$j#                  ||||	z        }' |||||	z        }(|&|'|(fS )Nr2   infr0   Fdimkeepdimr5   T)r4   r6   viewr   trilonesboolwherefloatr8   r   maximumr;   r<   	unsqueezer=   absreshape))rZ   r"   r#   r&   r'   r(   r%   r$   r,   r-   r.   r[   rE   rC   r>   r?   dqkdhvmatC_k_statesvecN_k_statesscaMinter_k_statesltrmatF_logsig_chunkmatF_logsig_mask_chunkmatLogD_chunkvecMintra_kvecM_b_intervecM_k_combinematLogD_stabilized_chunk
matD_chunk
matS_chunk
matM_chunkvecBbarmatQ_chunk_gatedmatNumerator_commonvecDenom_l_commonvecDenom_max_commonmatH_k_chunkmatH_outvecN_outvecM_outs)                                            r   mlstm_chunkwise_parallel_fw_Hr      s    ++#ZJ#.#4#4 
BS#((RSBYL#((RSBYG-yyRZ=yyRZ=yyRZ=jjJJZ(jj
 !Aq!T!12T!Q4:J5KK!&S2CeEl]!S.aAtQ6F1GG ii2uELL 0Aq$??|[A'1aD(89#Aq!Q$45#0>#A YY78
T^^B33x?
*,
 ))L>9:'>H4.>dARR,}/F/Fr/JJZ^^`blp^Mqq#mmEII6G,H%))UcTcJde*.AC.GH$$ZR*_cJ '..z2rJO!*b"z/B8++r    queryrL   valueigatefgatecstatenstatemstatereturn_last_statesreturn_all_statesc                 d   | j                   \  }}}}||z  dk7  rt        d| d| d      ||z  }|j                  ||||      }|j                  ||||      }|j                  |      }|j	                  d      }||dz  }t        ||||||||||
      \  }}}t        | |||d d d d d | d d f   |d d d d d | f   |d d d d d df   ||||||      \  }}}|||f}|	r3||d d d d | d d d f   |d d d d | d f   |d d d d dd f   ffz  }n|d	z  }|
r||||ffz  }|S |d	z  }|S )
Nr   Sequence length   is not divisible by chunk size .r0   r1   
r"   r#   r$   r%   r)   r*   r+   r,   r-   r.   rZ   r"   r#   r&   r'   r(   r%   r$   r,   r-   r.   r[   N)r6   
ValueErrorra   
logsigmoidcumsumrX   r   )r   rL   r   r   r   r   r   r   r,   r   r   r-   r[   r>   r?   sequence_lengthrA   rC   r%   vecFvecF_logsigr$   rm   rn   ro   r   r   r   	ret_tuples                                r   mlstm_chunkwise_fwr      s   * 16-
BZ'1,//@@`ak`llmnoo
*zz*b"j9zz*b"j9 &&t,!!"%TzH <Z$!<
8}&8 (E%aFdUFAo6%aFdUFl3/1crc	:!(
$(H x2	q!dUVQ/q!dUV|1LN`abdegigjajNkl I  I=-9KLNNI   Ir    	c_initial	n_initial	m_initialc                 .   | j                   \  }}}}||
z  dk7  rt        d| d|
 d      ||
z  }|j                  ||||
      }|j                  ||||
      }t        j                  |      }|j                  d      }|dz  }t        |||||||||
|
      \  }}}t        | |||d d d d d | d d f   |d d d d d | f   |d d d d d df   ||||
||	      \  }}}|d d d d | d d d f   |d d d d | d f   |d d d d dd f   f}|r||fS |S )	Nr   r   r   r   r0   r1   r   r   )r6   r   ra   Fr   r   rX   r   )r   rL   r   r   r   r   r   r   r   r[   r-   kwargsr>   r?   r   rA   rC   r%   r   r   r$   r,   rm   rn   ro   r   r   r   last_statess                                r   mlstm_chunkwise_native_autogradr   ;  s    16-
BZ'1,//@@`ak`llmnoo
*zz*b"j9zz*b"j9 ll4(!!"%: <Z""'!<
8}&8 (E%aFdUFAo6%aFdUFl3/1crc	:!(
$(H %QD561_5}QD56\7RTfghjkmompgpTqr[((Or    dtype_statec
           	         | j                   }|j                  |	      }|j                  |	      }|j                  |	      }| j                  \  }}}|j                  \  }}}| j                  |j                  k7  rt        d      |j                  ||||fk7  rt        d|j                         |j                  |||fk7  rt        d|j                         |j                  ||dfk7  rt        d|j                         |j                  ||dfk7  rt        d|j                         |j                  ||dfk7  rt        d|j                         t        j
                  j                  j                  |      }t	        j                  ||z   |      }t	        j                  ||z   |z
        }t	        j                  ||z
        }| |d	z  z  }|d
d
d
d
d
d
d
f   |z  |d
d
d
d
d
d
d
f   |d
d
d
d
d
d
d
f   |d
d
d
d
d
d
d
f   z  z  z   }||z  ||z  z   }|d
d
d
d
d
d
d
f   |j                  |      z  }|j                  d      j                  |	      }|d
d
d
d
d
d
d
f   |d
d
d
d
d
d
d
f   j                  |      z  }|j                  d      }t	        j                  |       }t	        j                  |j                         |      |z   j                  |	      }||z  }|j                  |      }|j                  |	      }|j                  |	      }|j                  |	      }||||ffS )z?This is a single step of the mLSTM operation in recurrent form.r3   z&query and key must have the same shapezmatC_old has wrong shape, got zvecN_old has wrong shape, got r   zscaM_old has wrong shape, got zscaI has wrong shape, got zscaF has wrong shape, got r1   N   )r3   tor6   r   r   r   
functionalr   r8   r;   r9   rg   ri   ) r   rL   r   r   r   r   r   r   r[   r   r   	dtype_qkvmatC_oldvecN_oldscaM_oldr>   r?   rA   r@   rB   scaF_logscaM_state_newscaF_actscaI_actvecQ_scaledmatC_state_newvecN_state_newh_numqn_dotproductmax_valh_denomhs                                    r   mlstm_recurrent_step_nativer   |  sG    KK	99;9/99;9/99;9/${{
B[[
1d;;#))#EFF>>j"dD99=hnn=MNOO>>j"d33=hnn=MNOO>>j"a00=hnn=MNOO;;:r1--9%++GHH;;:r1--9%++GHH 88&&11%8 8h#6>99X0>AB99U^34t~.!!Q4-08;hq!QPT}>U1aq!T1}!55?
 
 "H,x#~=Aq$M*^->->Y->-OOa ##+#6#Aq$M2^Aq!TM5R5U5U\e5U5ff%--a0))^O,==!2!2!4g>DHH{H[GODDyD!'***='***='***=>>>BBBr    c                    | j                   \  }}}}|j                   d   }| j                  }|U||t        d      ||t        d      |j                  |
      |j                  |
      |j                  |
      }}}nRt	        j
                  ||||f|
|      }t	        j
                  |||f|
|      }t	        j
                  ||df|
|      }g }t        |      D ]w  }|d d d d |d f   |d d d d |d f   }}| d d d d |d d f   |d d d d |d d f   |d d d d |d d f   }}}t        d	|||||||||	|
d
|\  }\  }}}|j                  |       y t	        j                  |d      }|r||||ffS |S )
Nr0   z)Initial states must be provided together.r   r2   r   )
r   r   r   r   rL   r   r   r   r[   r   r5   r_    )
r6   r4   r   r   r   r7   r:   r   appendstack)r   rL   r   r   r   r   r   r   r   r[   r   r   r>   r?   r   rA   rl   r4   
matC_state
vecN_state
vecM_state	vecH_listtvecF_tvecI_tvecQ_tvecK_tvecV_tvecHmatHs                                 r   mlstm_recurrent_sequence_nativer     s   ( 16-
Bkk"o  I$5 !LMM I$5 !LMM;/;/;/ %/
J j"dC%@\bcJj"d%;;W]^Jj"a%8TZ[J	' 	#A"1aD=15Aq$3GFF &+1aA:%6Aq!QJqRSUVXYzIZFFF :U :!!!': :6D6:z: T"+	#. {{9"-*j*===Kr    mlstm_chunkwise_kernelautocast_kernel_dtypec                    |	rt        dd      |j                  \  }}}}|}||z  dk7  r||z   dz
  |z  |z  }|j                  ||||j                  d         }|j                  ||||j                  d         }|j                  ||||j                  d         }|j                  |||      }|j                  |||      }||d d d d d |d d f<   ||d d d d d |d d f<   ||d d d d d |d d f<   ||d d d d d |f<   ||d d d d d |f<   n
|}|}|}|}|} | d|||||||||	|
||d|}|d d d d d |d d f   }|S )Nz6We are padding zeros, so we cannot return last states,z*as they would be not the true last states.r   r   r   )r   rL   r   r   r   r   r   r   r   r[   r   r-   r   )r   r6   	new_zeros)r   r   rL   r   r   r   r   r   r   r   r[   r   r-   r   r>   r?   r   rA   
S_unpaddedS_paddedq_padk_padv_padi_padf_padr   s                             r   wrap_chunkwise_pad_zerosr     s     H< 
 16-
B$
Z'1,(:59jHJVHOOJHekk!nMEMM*b(CIIaLIEOOJHekk!nMEOOJH=EOOJH=E*/E!QQ&'*-E!QQ&'*/E!QQ&'',E!Q#$',E!Q#$EEEEE% 
1"7!
 
 Aq+:+q()r    mlstm_sequence_kernelmlstm_step_kernelenable_loggingc                 v   |j                   \  }}}}|j                   d   }||n2t        j                  |||||j                  t        j                        }|	|	n1t        j                  ||||j                  t        j                        }|
|
n1t        j                  ||d|j                  t        j                        }|dkD  r~g }d}||z
  }||z  }|dkD  r||z  }||z   } | |d||ddf   j                         |d||ddf   j                         |d||ddf   j                         |d||f   j                         |d||f   j                         ||||d||      \  }\  }}}||z  }|j                  |       ||z
  }|dkD  r ||d||ddf   j                         |d||ddf   j                         |d||ddf   j                         |d||f   j                         |d||f   j                         |||d|	
      \  }\  }}}|j                  |       t        j                  |d
      }nh|dk7  rt        d| d       ||j                  d
      |j                  d
      |j                  d
      ||||||	      \  }\  }}}|dddddddf   }|r||||ffS |S )af
  This function computes the last hidden state and matH outputs of the mLSTM, independently of the sequence length.

        For this it uses three kernels:
        - mlstm_chunkwise_kernel: mlstm chunkwise kernels that processes chunks of a given chunk size in parallel.
        - mlstm_sequence_kernel: mlstm kernel that processes the remaining sequence length in a single step recurrence.
        - mlstm_step_kernel: mlstm kernel that processes a sequence length of 1 in a single step.

        It tries to maximize the chunksizes to improve performance.
        It will start with the given chunk size and then divides the chunksize by 2 until the chunk size is smaller than 16.
        At every chunksize it will process the maximal number of chunks that fit into the remaining sequence length.

        E.g. for chunk_size = 64, this function will try the chunksizes [64, 32, 16] if necessary.

        For the remaining sequence length, which is smaller than 16, we use a different kernel that computes the mLSTM
        in a single step and loop over this in pytorch.

        Args:
            mlstm_chunkwise_kernel: The mLSTM chunkwise kernel that processes chunks of a given chunk size in parallel
            mlstm_sequence_kernel: The mLSTM kernel that processes the remaining sequence length in a single step recurrence
            query: The query tensor (batch_size, nh, sequence_length, dhqk)
            key: The key tensor (batch_size, nh, sequence_length, dhqk)
            value: The value tensor (batch_size, nh, sequence_length, dhhv)
            fgate: The forget gate tensor (batch_size, nh, sequence_length)
            igate: The input gate tensor (batch_size, nh, sequence_length)
            c_initial: The initial cell state tensor (batch_size, nh, dhqk, dhhv)
            n_initial: The initial hidden state tensor (batch_size, nh, dhqk)
            m_initial: The initial memory state tensor (batch_size, nh, 1)
            return_last_states: If True, the function will return the last states of the mLSTM
            eps: The epsilon value used for numerical stability
            autocast_kernel_dtype: The dtype used for the kernel computation
            chunk_size: The chunk size used for the chunkwise kernel
            enable_logging: If True, the function will log debug information. Default is False.

        Returns:
            The last hidden state tensor (batch_size, nh, sequence_length, dhhv) or a tuple containing the last hidden state tensor and the last states of the mLSTM
            Last states are (cstate (batch_size, nh, dhqk, dhhv), nstate (batch_size, nh, dhqk), mstate (batch_size, nh, 1)).
        r0   Nr4   r3   r   r   .T)r   rL   r   r   r   r   r   r   r-   r   r   r[   )
r   rL   r   r   r   r   r   r   r   r[   r   r   z)Received empty sequence (sequence_length=z3), require at least single element in the sequence.)	r   rL   r   r   r   r   r   r   r[   )
r6   r   r7   r4   float32
contiguousr   concatenater   r9   )r   r   r   r   rL   r   r   r   r   r   r   r   r[   r   r-   r   r>   r?   r   rA   rB   c_staten_statem_stateh_outsseq_len_start_idxremaining_seq_lenr.   iter_seq_lenseq_len_idxh_outs                                  r   (wrap_chunkwise_arbitrary_sequence_lengthr   B  sb   p 14		-
B{{2 $ ZT4

RWR_R_` 	 $ ZT#**EMMZ 	 $ ZQszzW 	 QF ! /2C C*j8JA~)J6/,>5K%6{%BA EFQQSC!2;!>ABMMO%6{%BA EFQQS%6{%B BCNNP%6{%B BCNNP%%%)'+*?622' "\1!e$ /2C C 1$5J%6%F IJUUWC!2?!BAEFQQS%6%F IJUUW%6%F FGRRT%6%F FGRRT%%%'+622' e$%%f!4E !# ??P  QD  E  2CmmA&KKNmmA&
2.E.GWg !Qa-(E7GW555Lr    c                       e Zd ZdZeZdef fdZ	 	 	 	 	 ddej                  dej                  dej                  dej                  dej                  d	ej                  d
ej                  dej                  de	de
ed      deej                  eej                  eej                  ej                  ej                  f   f   f   fdZdefdZ xZS )xLSTMBackendzxLSTM Backend Module for PyTorch.

        This module wraps the xLSTM kernels and provides a high-level interface for training and inference.
        configc                    t         |           || _        t        | _        t
        | _        t        | _        t        t        | j                  t        | j                  t        t        |j                              t        | j                  t        t        |j                              |j                  |j                  t        t        |j                         d      | _        t        | j                  t        t        |j                         |j                  |j                        }d|j$                  v rt        t&        |      }|| _        y )N)r   T)r   r   r   r-   r[   r   r   )r   r[   r-   with_padding)r   )super__init__r   r   chunkwise_kernel_fnr   sequence_kernel_fnr   step_kernel_fnr   r   getattrr   inference_state_dtyper-   r[   r   _inference_fnmoder   	_train_fn)selfr   train_kernel_fn	__class__s      r   r   zxLSTMBackend.__init__  s   G DK'FD$&ED#"=D!(8'+'?'?&-++ 'v/K/K L' #*'' 'v/K/K L# ",,JJ&-eV5Q5Q&R#'"D" &((&-eV5Q5Q&RJJ!,,	O ,")*B[j"k,DNr    r   rL   r   r   r   r   r   r   r   r   )train	inferencer   c                 h   |
| j                   j                  }
d|
v rX|	| j                   j                  }	| j                   j                  dk(  r|	rt        d      | j	                  |||||||||		      S d|
v r| j                  ||||||||      S t        d| j                   j                         )a  Forward pass of the mLSTM backend.

            Depending on the configured mode, this method will call the appropriate kernel function.

            Args:
                query: The query tensor of shape (batch_size, nh, sequence_length, dhqk).
                key: The key tensor of shape (batch_size, nh, sequence_length, dhqk).
                value: The value tensor of shape (batch_size, nh, sequence_length, dhhv).
                igate: The input gate preactivation tensor of shape (batch_size, nh, sequence_length).
                fgate: The forget gate preactivation tensor of shape (batch_size, nh, sequence_length).
                c_initial: The initial cell state tensor of shape (batch_size, nh, dhqk, dhhv).
                                                    Defaults to None.
                n_initial: The initial hidden state tensor of shape (batch_size, nh, dhqk). Defaults to None.
                m_initial: The initial memory tensor of shape (batch_size, nh, 1). Defaults to None.
                return_last_states: Whether to return the last states of the sequence. Defaults to None.
                                                    If None, the value from the config is used.

            Returns:
                hidden states of shape (batch_size, nh, sequence_length, dhhv)
                hidden states and last states the last states are the cell state cstate (batch_size, nh, dhqk, dhhv),
                the normalizer state nstate (batch_size, nh, dhqk), and the max state mstate (batch_size, nh, 1)
            r   train_with_paddingzFreturn_last_states=True is not supported with train_with_padding mode.)	r   rL   r   r   r   r   r   r   r   r   r   rL   r   r   r   r   r   r   zUnknown mode: )r   r   r   r   r   r   )r   r   rL   r   r   r   r   r   r   r   r   s              r   forwardzxLSTMBackend.forward  s    F |{{''$%-)-)G)G&;;##';;)()qrr~~''''9 & 
 
 $))''' * 	 	 !>$++2B2B1C!DEEr    c                     | j                    S r   r   r   s    r   
extra_reprzxLSTMBackend.extra_reprJ  s    kk]#r    )NNNFN)__name__
__module____qualname____doc__r   config_classr   r   Tensorrd   r   r   r   tupler  strr  __classcell__r   s   @r   r   r     s%   	
 # 	-;  	-R '+&*&*',<@G	F<<G	F G	F <<	G	F
 <<G	F <<G	F ||G	F ||G	F ||G	F !%G	F 7#789G	F 5<<u||U5<<W\WcWc;c5d'd!eefG	FR	$ 	$r    r   c                        e Zd ZdZ	 	 	 	 ddededededef
 fdZdej                  d	ej                  fd
Z
dej                  d	ej                  fdZdej                  d	ej                  fdZ xZS )r   a3  Root mean square normalization layer implementation similar
        to https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html.

        It normalizes the input tensor by the root mean square of the last dimension.

        Args:
            num_features: The number of features in the input tensor.
            eps: A small value to avoid division by zero.
            use_weight: Whether to use a learnable weight.
            use_bias: Whether to use a learnable bias.
            force_float32_reductions: Whether to force float32 reductions.
        num_featuresr[   
use_weightuse_biasforce_float32_reductionsc                 (   t         |           || _        || _        || _        |r.t        j                  t        j                  |            | _	        nd | _	        |r.t        j                  t        j                  |            | _        y d | _        y r   )r   r   r  r[   r  r   	Parameterr   rc   weightr7   bias)r   r  r[   r  r  r  r   s         r   r   zxLSTMRMSNorm.__init__[  sm     G ,DDH,DD) ll5::l+CD"LL\)BC	 	r    xr   c                 r    | j                   || j                   z  }| j                  || j                  z   }|S r   r  r  r   r  s     r   _apply_weight_biaszxLSTMRMSNorm._apply_weight_biasr  5    {{&Oyy$		MHr    c                     |j                   }| j                  r|j                         }|t        j                  |j                  d      j                  dd      | j                  z         z  }|j                  |      S )Nr   r0   Tr^   )	r3   r  rf   r   rsqrtpowmeanr[   r   )r   r  in_dtypes      r   _rms_normalizezxLSTMRMSNorm._rms_normalizey  s^    wwH,,GGIEKKa"d Cdhh NOOA44>!r    c                 J    | j                  |      }| j                  |      }|S r   )r&  r  r  s     r   r  zxLSTMRMSNorm.forward  s'    ##A&A''*AHr    rY   TFT)r  r	  r
  r  intrf   rd   r   r   r  r  r&  r  r  r  s   @r   r   r   M  s    	  #"-1	!	! 	! 		!
 	! '+	!.	 	 		"ELL 	"U\\ 	"	U\\ 	ell 	r    r   c                        e Zd ZdZ	 	 	 	 ddedededededef fdZd	ej                  d
ej                  fdZ
d	ej                  d
ej                  fdZd	ej                  d
ej                  fdZ xZS )xLSTMMultiHeadLayerNormam  Multi-head version of the LayerNorm layer.

        It normalizes the last dimension of the input tensor.

        The input is assumed to have the shape (batch_size, sequence_length, nh, DH), where:
        batch_size: batch size
        sequence_length: sequence length
        nh: number of heads
        DH: head dimension

        The normalization is applied over the last dimension (DH) of the input tensor.

        Args:
            num_heads: The number of heads.
            head_dim: The head dimension.
            eps: A small value to avoid division by zero.
            use_weight: Whether to use a learnable weight.
            use_bias: Whether to use a learnable bias.
            force_float32_reductions: Whether to force float32 reductions

        Returns:
            The normalized tensor with the shape (batch_size, sequence_length, nh * DH).
        	num_headshead_dimr[   r  r  r  c                 r   t         |           ||z  | _        || _        || _        |r8t        j                  t        j                  | j                              | _	        nd | _	        |r8t        j                  t        j                  | j                              | _        nd | _        || _        || _        y r   )r   r   r  r[   r  r   r  r   rc   r  r7   r  r,  r-  )r   r,  r-  r[   r  r  r  r   s          r   r   z xLSTMMultiHeadLayerNorm.__init__  s     G )H 4DDH,DD) ll5::d6G6G+HI"LLT5F5F)GH	 	&DN$DMr    r  r   c                 r    | j                   || j                   z  }| j                  || j                  z   }|S r   r  r  s     r   r  z*xLSTMMultiHeadLayerNorm._apply_weight_bias  r   r    c                    |j                   }| j                  r|j                         }||j                  dd      z
  }|t	        j
                  |j                  ddd      | j                  z         z  }|j                  |      S )Nr0   Tr^   F)r_   r`   unbiased)	r3   r  rf   r$  r   r"  varr[   r   )r   r  r%  
x_centeredys        r   _layer_normalizez(xLSTMMultiHeadLayerNorm._layer_normalize  sr    wwH,,GGIQVVDV99JU[[2te)TW[W_W_)_``A44>!r    c                 n   |j                   \  }}}}|| j                  k7  r(t        d| j                   d| d|j                          || j                  k7  r(t        d| j                   d| d|j                          | j	                  |      }|j                  ||d      }| j                  |      }|S )Nz	Expected z heads, got z, input shape: z head dimension, got r0   )r6   r,  r   r-  r5  rj   r  )r   r  r>   r   r?   DHs         r   r  zxLSTMMultiHeadLayerNorm.forward  s     34''/JRT^^# 9T^^,<LO\]\c\c[d!effT]]" 9T]]O;PQSPTTcdedkdkcl!mnn%%a(A		*or:A''*AHr    r(  )r  r	  r
  r  r)  rf   rd   r   r   r  r  r5  r  r  r  s   @r   r+  r+    s    	8 #"-1	%	% 	% 		%
 	% 	% '+	%4	 	 		"ell 	"u|| 	"	||	 \\	r    r+  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )xLSTMFeedForwardr   c                 Z   t         |           || _        t        |j                  |j
                  z  |j                        | _        | j                  j                  dk(  rt        j                  |j                  | j                  | j                  j                        | _        t        j                  |j                  | j                  | j                  j                        | _        na| j                  j                  dk(  rHt        j                  |j                  d| j                  z  | j                  j                        | _        t        j                  | j                  |j                  | j                  j                        | _        t        j                          | _        y )Nsinglein_featuresout_featuresr  fusedr   )r   r   r   r   hidden_sizeffn_proj_factorffn_round_up_to_multiple_ofup_proj_dimweight_moder   Linearr  proj_up_gateproj_upproj_up_gate_z	proj_downSiLUact_fnr   r   r   s     r   r   zxLSTMFeedForward.__init__  s=   G DK;""V%;%;;22 D
 {{&&(2$&II & 2 2!%!1!1--%!
  "yy & 2 2!%!1!1-- 
 ((G3&(ii & 2 2!"T%5%5!5--'#  YY ,,#//[[))DN '')DKr    r  r   c                    | j                   j                  dk(  r3| j                  | j                  |            | j	                  |      z  }nd| j                   j                  dk(  rK| j                  |      }t        j                  || j                  fd      \  }}| j                  |      |z  }| j                  |      }|S )Nr;  r?  r0   r   )
r   rD  rK  rF  rG  rH  r   tensor_splitrC  rI  )r   r  gatezr4  s        r   r  zxLSTMFeedForward.forward  s    {{&&(2KK 1 1! 45QG((G3''*,,Q1A1A0CLaKK%)q!AHr    )	r  r	  r
  r   r   r   r  r  r  r  s   @r   r9  r9    s+    !	$; !	$F		U\\ 		ell 		r    r9  c            
       z     e Zd Zdef fdZ	 ddej                  dee   de	ej                  ee   f   fdZ
 xZS )
xLSTMLayerr   c                 (   t         |           || _        t        |j                  |j
                  z        | _        t        |j                  |j                  z        | _        | j                  j                  dk(  rt        j                  | j                  j                  | j                  | j                  j                        | _        t        j                  | j                  j                  | j                  | j                  j                        | _        t        j                  | j                  j                  | j                  | j                  j                        | _        t        j                  | j                  j                  | j                  | j                  j                        | _        t        j                  | j                  j                  | j                  j"                  d      | _        t        j                  | j                  j                  | j                  j"                  d      | _        n| j                  j                  dk(  rt        j                  | j                  j                  d| j                  z  d| j                  z  z   | j                  j                        | _        t        j                  | j                  j                  d| j                  j"                  z  d      | _        t        j,                         | _        t1        | j                        | _        t5        | j                  j"                  | j                  | j                  j"                  z  | j                  j6                  d| j                  j                  | j                  j8                        | _        t        j                  | j                  | j                  j                  | j                  j                        | _        y )Nr;  r<  Tr?  r   r  )r,  r-  r[   r  r  r  )r   r   r   r)  r@  v_dim_factorv_dimqk_dim_factorqk_dimrD  r   rE  r  qkvogate_preactr,  igate_preactfgate_preactqkv_opreactifgate_preactSigmoidogate_act_fnr   mlstm_backendr+  norm_epsnorm_reduction_force_float32multihead_normout_projrL  s     r   r   zxLSTMLayer.__init__  s   G DKV//&2E2EEFDJf0063G3GGHDK{{&&(2 $ 7 7!%--
  $ 7 7!%--
  $ 7 7!%-- %'II $ 7 7!%--%!
 %'II $ 7 7!%!6!6%!
 %'II $ 7 7!%!6!6%!
 ((G3#%99 $ 7 7!"T[[1tzz>!A--$ 
 &(YY $ 7 7!"T[[%:%:!:&" !#

D!-T[[!AD"9++//t{{'<'<<KK((--)-)Q)Q#D II JJ![[44[[))DMr    r  stater   c           
         |j                   dk7  rt        d|j                         |j                  \  }}}| j                  j                  dk(  r| j                  |      }| j                  |      }| j                  |      }| j                  |      }	t        | j                  |      | j                  j                        }
t        | j                  |      | j                  j                        }n| j                  j                  dk(  r| j                  |      }t        j                  || j                   d| j                   z  d| j                   z  | j"                  z   fd      \  }}}}	t        | j%                  |      | j                  j                        }t        j                  || j                  j&                  fd      \  }
}j)                  ||| j                  j&                  d      j+                  d	d      }j)                  ||| j                  j&                  d      j+                  d	d      }j)                  ||| j                  j&                  d      j+                  d	d      }
j+                  d	d      }
j+                  d	d      }|d
\  }}}n|\  }}}| j-                  ||||
||||      \  }}|| j                  j&                  || j"                  | j                  j&                  z  f}|j                  |k7  rt        d|j                   d|       |j+                  d	d      }| j/                  |      }|j)                  ||d      }| j1                  	      |z  }| j3                  |      }||fS )Nr   z=Input must have shape [batch_size, sequence_length, HD], got r;  )r   r?  r   r0   r   r   )NNNr  zGot z, expected )ndimr   r6   r   rD  rX  rY  rZ  r[  r   r\  gate_soft_capr]  r^  r   rN  rW  rU  r_  r,  rj   r<   rb  re  ra  rf  )r   r  rg  r>   r   r@   r   rL   r   o_preacti_preactf_preactr^  	if_preactr   r   r   r   expected_h_shapeh_normr   r4  s                         r   r  zxLSTMLayer.forwardL  s7    vv{ #`abahah`i!jkk-.WW*J{{&&(2q	ffQiq	,,Q/#D$5$5a$8DKKD]D]^#D$5$5a$8DKKD]D]^((G3"..q1.3.@.@DKKDKK$**4
 /+sE8 %T%7%7%:dkkF_F_`	%*%7%7	DKKDYDYC[ac%d"(MM*ot{{?T?TVXYccdeghiE++j/4;;;P;PRTU__`acdeCMM*ot{{?T?TVXYccdeghiE))!Q/H))!Q/H}2B/	9i27/	9i))### * 	HAu %%

dkk333	  ww** 4y<L;M!NOOAq!A((+F^^JDF%%h/&8Ee$Ae8Or    r   )r  r	  r
  r   r   r   r  r   mLSTMLayerStateTyper  r  r  r  s   @r   rR  rR    sX    B	; B	J KOA	\\A	*23F*GA	5<<*=!>>?A	r    rR  c            	       t     e Zd Zdef fdZ	 ddej                  dee   de	ej                  ef   fdZ
 xZS )
xLSTMBlockr   c                 d   t         |           || _        t        |j                  |j
                  d|j                  |j                        | _        t        |      | _
        t        |j                  |j
                  d|j                  |j                        | _        t        |      | _        y )NT)r  r[   r  r  r  )r   r   r   r   r@  rc  r  rd  
norm_mlstmrR  mlstm_layernorm_ffnr9  ffnrL  s     r   r   zxLSTMBlock.__init__  s    G DK*#//OO)/)L)LDO  *&1D(#//OO)/)L)LDM (/DHr    r  rg  r   c                     | j                  |      }| j                  ||      \  }}||z   }| j                  |      }| j                  |      }||z   }||fS r   )ru  rv  rw  rx  )r   r  rg  x_mlstmx_ffns        r   r  zxLSTMBlock.forward  s^     ooa(G!--gu=NGUGAMM!$EHHUOEE	Ae8Or    r   )r  r	  r
  r   r   r   r  r   r   r  r  r  r  s   @r   rs  rs    sJ    	0; 	0* FJ	\\	*2>*B	5<</0	r    rs  c                 (    dd| z  z  dz  fd}|S )a>  
    Adapted from: https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py
    Fills the input Tensor with values according to the method described in Transformers without Tears: Improving
    the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2019), using a normal distribution.r            ?c                 \    t         j                  j                  j                  | d      S Ng        )r$  stdr   r   initnormal_tensorr  s    r   init_z small_init_method.<locals>.init_  #    xx}}$$V#3$??r    r   )r_   r  r  s     @r   small_init_methodr    s$    
 C=e
$C@ Lr    c                 (    d| z  |dz  z  fd}|S )zh
    Adapted from https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py
    r   r~  c                 \    t         j                  j                  j                  | d      S r  r  r  s    r   r  zwang_init_method.<locals>.init_  r  r    r   )n_layersr_   r  r  s      @r   wang_init_methodr    s$     h,
'C@ Lr    c                   2    e Zd ZdZeZdZdgZdZdZ	d Z
d Zy)xLSTMPreTrainedModelzR
    An abstract class for an interface to loading a pre-trained xLSTM model.
    backboners  Tc                 D    | j                         D ]  \  }}||u s|c S  y)N )named_modules)r   modulenamemods       r   _module_name_mapz%xLSTMPreTrainedModel._module_name_map  s/    ++- 	ID#f}	 r    c                 
   t        |t        j                        r: t        | j                  j
                        | j                  j                         y t        |t        j                        r8|j                  3t        j                  j                  j                  |j                         | j                  j                  dk(  rGd| j                  |      v r3t        j                  j                  j                  |j                         t        j                         5  d| j                  |      v r<|j                  j!                  dt        j"                  |j                        z         nd| j                  |      v r|j                  j!                  t        j$                  dd|j                  j&                  d         j)                  |j                  j*                  |j                  j,                  	             d d d        y | j                  j                  d
k(  rd| j                  |      v rxt        j                  j                  j                  |j                         t        j                         5  |j                  d | j                  j.                  xxx |j                  d | j                  j.                    dt        j"                  |j                        z  z
  z  ccc |j                  d | j                  j.                  xxx |j                  | j                  j.                  d   t        j$                  dd|j                  j&                  d         j)                  |j                  j*                  |j                  j,                  	      z   z  ccc d d d        y d| j                  |      v rI t1        |j                  j&                  d   | j                  j2                        |j                         y d| j                  |      v rF t1        | j                  j
                  | j                  j2                        |j                         y |j                  0 t        | j                  j
                        |j                         y y t        |t4              st7        |d      rt        j                  j                  j9                  |j                         t7        |d      rA|j                  4t        j                  j                  j                  |j                         y y y y # 1 sw Y   y xY w# 1 sw Y   y xY w)Nr;  rO  r   g      $r   g      @g      @r0   r   r?  g      $@rI  r   )r_   r  rf  r5  r  )
isinstancer   	Embeddingr  r   r@  
embeddingsr  rE  r  r   r  zeros_rD  r  no_gradcopy_	ones_likelinspacer6   r   r4   r3   r,  r  num_hidden_layersr   hasattrones_)r   r  s     r   _init_weightsz"xLSTMPreTrainedModel._init_weights  s   fbll+6dkk556t7M7MN		*{{&$$V[[1{{&&(2vAVAVW]A^7^$$V]]3]]_ $"7"7"??))%%//&++2N*NO D$9$9&$AA))!NN # # & 1 1" 5 !b'-{{'9'9&,kk&7&7 ! 		  ((G3$BWBWX^B_8_$$V]]3]]_ KK 7$++"7"78V[[/$++//> =uv{{;;=< <8 KK 7$++"7"78V[[I^I^I`=a<adidrdr))"-e b%{{11$kk//  	= 8	   5 5f ==d V]]%8%8%;dkkFcFcdekererst44V<<e T[[%<%<t{{GdGdeflfsfst*:!$++"9"9:6==I +-AS1THHMM.vv&6;;+B$$V[[1 ,C& 2UE   s   $C#UD(UUUN)r  r	  r
  r  r   r  base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr  r  r   r    r   r  r    s1     L"%&*#L-2r    r  c            
       b    e Zd ZdZej
                  dfdededej                  de	e
   fdZd Zy)	
xLSTMCachea[  
    Cache for xLSTM model which does not have attention mechanism and key value states.

    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The batch size with which the model will be used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Attributes:
        seqlen_offset: int
        dtype: torch.dtype

    Example:

        ```python
        >>> from transformers import AutoTokenizer, xLSTMForCausalLM, xLSTMCache

        >>> model = xLSTMForCausalLM.from_pretrained("NX-AI/xLSTM-7b")
        >>> tokenizer = xLSTMTokenizer.from_pretrained("NX-AI/xLSTM-7b")

        >>> inputs = tokenizer(text="I am an xLSTM", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = xLSTMCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, cache_params=cache_params, use_cache=True)
        >>> outputs.cache_params
        xLSTMCache()
    Nr   max_batch_sizer3   r4   c                    d| _         || _        || _        t        |j                        D ci c]  }|t        j                  ||j                  |j                  |j                  g||      t        j                  ||j                  |j                  g||      t        j                  ||j                  dg||      f c}| _
        y c c}w )Nr   r2   r   )seqlen_offsetr3   r   r:   r  r   r7   r,  qk_head_dim
v_head_dim	rnn_state)r   r   r  r3   r4   r   layers          r   r   zxLSTMCache.__init__/  s     
 v778
  #V%5%5v7I7I6K\K\]!
 ^V-=-=v?Q?QRZ_hno^V-=-=qAW]^ 
 
s   BCc           
      &   | j                   D ci c]q  }|t        j                  | j                   |   d         t        j                  | j                   |   d         t        j                  | j                   |   d         fs c}| _         y c c}w )Nr   r   r   )r  r   
zeros_like)r   r  s     r   resetzxLSTMCache.resetG  s     
    !6q!9:  !6q!9:  !6q!9: 
 
s   A6B)r  r	  r
  r  r   bfloat16r   r)  r3   r   r  r   r  r   r    r   r  r    sL     L #^^ $

 
 {{	

 
0
r    r  c                   x    e Zd ZU dZeej                     ed<   dZee	   ed<   dZ
eeej                        ed<   y)xLSTMOutputz
    cache_params (`xLSTMCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    last_hidden_stateNcache_paramshidden_states)r  r	  r
  r  r   r   FloatTensor__annotations__r  r  r  r  r   r    r   r  r  R  sC       1 122)-L(:&-8<M8E%"3"345<r    r  c                        e Zd Z fdZd Zd Zee	 	 	 	 	 ddee	j                     dee	j                     dee   dee   dee   d	eeef   fd
              Z xZS )
xLSTMModelc                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        |j                  |j                        | _        d| _        | j#                          y c c}w )N)r[   F)r   r   r   r  
vocab_sizeembedding_dimr  
ModuleListr:   
num_blocksrs  blocksr   r@  rc  out_normgradient_checkpointing	post_init)r   r   r@   r   s      r   r   zxLSTMModel.__init__b  s     ,,v'8'8&:N:NOmmvGXGXAY$ZAZ%7$Z[$V%7%7V__M&+#	 %[s   &Cc                     | j                   S r   r  r  s    r   get_input_embeddingszxLSTMModel.get_input_embeddingsl  s    r    c                     || _         y r   r  )r   new_embeddings     r   set_input_embeddingszxLSTMModel.set_input_embeddingso  s	    'r    	input_idsinputs_embedsr  	use_cacheoutput_hidden_statesr   c           
         ||n| j                   j                  }||n#| j                  s| j                   j                  nd}| j                  r| j                  r|rd}|du |duz  rt        d      || j                  |      }|r>|<t        | j                   |j                  d      |j                  |j                        }|}| j                  s| j                   j                  |j                  d   k  r|sd}t        j                         5  |$t        | j                   |j                  d         }t        j                  |      }	||j                  d   k  r)|dd|t!        || j                   j                  z   |j                  d         f   }
t#        | j$                        D ]r  \  }} ||
|j&                  |         \  }
}t)        t+        |j&                  |               D ](  }||   }|j&                  |   |   j-                  |       * d|_        t |
|	dd|t!        || j                   j                  z   |j                  d         f<   || j                   j                  z  }||j                  d   k  r)|	}ddd       n|rd	nd}t#        | j$                        D ]  \  }}| j                  r>| j                  r2| j1                  |j2                  |||j&                  |   nd      \  }}n ||||j&                  |   nd      \  }}|rSt)        t+        |j&                  |               D ](  }||   }|j&                  |   |   j-                  |       * d|_        |s||fz   } |r"|xj4                  |j                  d   z  c_        | j7                  |      }|r|fz   }t9        ||
      S # 1 sw Y   TxY w)r
        cache_params (`xLSTMCache`, *optional*):
            The xLSTMCache that carries the RNN states.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   r>   )rg  r   )r  r  r  )r   r  trainingr  r  r   r  r  sizer4   r3   max_inference_chunksizer6   r   r  r  min	enumerater  r  r:   lenr  rnn_state_initial_gradient_checkpointing_func__call__r  r  r  )r   r  r  r  r  r  r   r  offsetfinal_statehidden_states_chunk	layer_idxxlstm_blockr  	state_idxlocal_rnn_stateall_hidden_statess                    r   r  zxLSTMModel.forwardr  s     %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	&&4==YI-t";<YZZ  OOI6M-%]//2=;O;OWdWjWjL & 33m6I6I!6LL(F ,'#-T[[]M`M`abMc#dL#..}=}22155*76C1T1T(TVcViVijkVl$mmm+' 3<DKK2H ?.	;9D/"."8"8"C:6+Y */s<3I3I)3T/U)V `I.7	.BO(229=iHNN_` :?6? ,  6C1T1T(TVcViVijkVl$mmm dkkAAAF! }22155" !,+, ,. ';*3DKK*@ M&	;..4==/3/P/P#,,%=I=U..y9[_0,M9 0;%COC[l44Y?ae0,M9  %*3|/E/Ei/P+Q%R \	*3I*>$..y9)DJJ?[\ 6;L2'(9]<L(L%'M* &&-*=*=a*@@&m4 1]4D D+%+
 	
k, ,s   E6OOO)NNNNN)r  r	  r
  r   r  r  r   r   r   r   
LongTensorr  rd   r   r  r  r  r  r  s   @r   r  r  `  s    (  1548-1$(/3`
E,,-`
   0 01`
 z*	`

 D>`
 'tn`
 
uk!	"`
  `
r    r  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   y)xLSTMCausalLMOutputaP  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`xLSTMCache`, *optional*, carrying the RNN states):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlosslogitsr  r  )r  r	  r
  r  r  r   r   r  r  r  r  r  r  r  r   r    r   r  r    s\     )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<r    r  c                       e Zd Z fdZd Zd Zd Zd Z	 	 	 	 ddee	   fdZ
ee	 	 	 	 	 	 ddeej                     d	eej                     dee	   d
eej                     dee   dee   deeef   fd              Z xZS )xLSTMForCausalLMc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NF)r  )
r   r   r  r  r   rE  r@  r  lm_headr  rL  s     r   r   zxLSTMForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr    c                     | j                   S r   r  r  s    r   get_output_embeddingsz&xLSTMForCausalLM.get_output_embeddings  s    ||r    c                     || _         y r   r  r   new_embeddingss     r   set_output_embeddingsz&xLSTMForCausalLM.set_output_embeddings  s	    %r    c                 6    | j                   j                         S r   )r  r  r  s    r   r  z%xLSTMForCausalLM.get_input_embeddings  s    }}1133r    c                 8    | j                   j                  |      S r   )r  r  r  s     r   r  z%xLSTMForCausalLM.set_input_embeddings  s    }}11.AAr    r  c                     |r||d d dd f   }||d d dd f   }||d|i}nd|i}|j                  ||d       |S )Nr0   r  r  )r  r  )update)r   r  attention_maskr  r  r  r   model_inputss           r   prepare_inputs_for_generationz.xLSTMForCausalLM.prepare_inputs_for_generation  sl     1
 "!RS&)I( -af 5$)=+];L'3L\	RSr    r  r  labelsr  r  r   c                     | j                   |f||||d|}|d   }	| j                  |	j                  | j                  j                  j                              j                         }
| j                  s| j                  j                  |
j                  d   k  rd}t        j                         5  ||
j                  d   k  rt        |
dd|t        || j                  j                  z   |
j                  d         f   | j                  j                        |
dd|t        || j                  j                  z   |
j                  d         f<   || j                  j                  z  }||
j                  d   k  rddd       n t        |
| j                  j                        }
d}||j                  |
j                        }|
dddddf   j!                         }|dddf   j!                         }t#               } ||j%                  d|j'                  d            |j%                  d            }t)        ||
|j*                  |j,                        S # 1 sw Y   xY w)r  )r  r  r  r  r   r   N.r0   )r  r  r  r  )r  r  r   r  r3   rf   r  r   r  r6   r   r  r   r  output_logit_soft_capr4   r   r   ra   r  r  r  r  )r   r  r  r  r  r  r  r   xlstm_outputsr  r  r  r  shift_logitsshift_labelsloss_fcts                   r   r  zxLSTMForCausalLM.forward  s<     &
%'!5
 
 &a(m..t||/B/B/H/HIJPPR}}!D!Dv||TU!VF Bv||A.muq&3v8[8[/[]c]i]ijk]l+m"mmn99nF1fs6DKK4W4W+WY_YeYefgYh'iiij dkkAAAF v||A.B B fdkk&G&GHFYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D"&33'55	
 	
+B Bs   3CII&)NNNN)NNNNNN)r  r	  r
  r   r  r  r  r  r   r  r  r   r   r   r  r  rd   r   r  r  r  r  r  s   @r   r  r    s    &4B -1 z*4  1559-1-1$(/36
E,,-6
   1 126
 z*	6

 ))*6
 D>6
 'tn6
 
u))	*6
  6
r    r  )r  r  r  r   )	NNNNNNNr!   r   )r!   r   rY   )NNNNFFr!   rY   )NNNFrY   r!   )Er  dataclassesr   typingr   r   r   torch.nn.functionalr   r   r   torch.utils.checkpointtorch.nnr   
generationr	   modeling_utilsr
   utilsr   r   r   r   configuration_xlstmr   xlstm.xlstm_large.modelr   rs  r   r   r   external_xlstm	functoolsr   r   r   r   r  r  rq  dictr)  rf   rX   r   rd   r   r   r   r3   r   r   r  r   r   Moduler   r+  r9  rR  r  r  r  r  r  r  r  r  __all__r   r    r   <module>r     s	    ! "      % ) - V V , @@DN!(AellELL HI#223NN: :(5AT;U2V :bgbnbn :0 %)$()-%)%)*.$(X:llX:llX: llX: ll	X:
 \\X: \\X:  ,,X: llX: llX: !<<X: 5/X: X: X: 
u||U\\5<<7	8X:J K,llK,llK, llK,
 \\K, \\K,  ,,K, llK, llK, K, K, K, K, 
u||U\\5<<7	8K,f  $##$(#("'O||O\\O ||O ||	O
 ||O O O O 5/O !O  O O O 
u||U\\5<<?@Au||U\\5<<?@A		C
On #'"&"&#(?||?\\? ||? ||	?
 ||? <<? <<? <<? !? ? ? 
u||U5<<u||U\\SXS_S_7_1`#`aa	b?T #(===C||=C\\=C ||=C ||	=C
 ||=C =C =C =C =C [[=C 
u||U5<<u||#KLL	M=CJ #'"&"&#(#(==G||G\\G ||G ||	G
 ||G <<G <<G <<G !G G [[G 
u||U\\5<<?@Au||U\\5<<?@A		C
G` #'"&"&#(-2^^< (<||< \\< ||	<
 ||< ||< <<< <<< <<< !< <  %{{< < 
u||U5<<u||U\\SXS_S_7_1`#`aa	b<N #'"&"&#'-2^^$!S (S'S $S ||	S
 \\S ||S ||S ||S <<S <<S <<S !S S  %{{S S  !S" 
u||U5<<u||U\\SXS_S_7_1`#`aa	b#Sjt$ryy t$l7ryy 7rP")) Pd-299 -^FRYY FP RYY  F
	>2? >2BC
 C
L 	=+ 	=  	= s
% s
 s
l =+ =  =" f
+_ f
 f
Rr    