
    rh                    x   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZ dd	lmZ dd
lmZmZmZmZ  e       rd dlm Z!  ejD                  e#      Z$ G d de      Z% G d de%      Z& G d de%      Z' G d de'      Z( G d de(      Z) G d d      Z* G d de*      Z+ G d de*      Z, G d de,      Z- G d de,      Z.d ed!e/ej`                  ej`                  f   f   d"ed!e/ej`                  ej`                  f   f   fd#Z1 G d$ d%      Z2 G d& d'      Z3 G d( d)e3      Z4 ed*      rd+e4fd,Z5d-ej4                  jl                  jn                  fd.Z8ej4                  jl                  js                  e4d/ e8e4jt                   d0e4jF                   d1 2       ejv                  jl                  jy                  e4d3         G d4 d5e4      Z= G d6 d7e3      Z> G d8 d9e>      Z? G d: d;e3      Z@ G d< d=e3      ZA G d> d?eA      ZB G d@ dAeB      ZC G dB dCe4      ZD G dD dEeD      ZE G dF dGeD      ZF G dH dIe3      ZGdJeeHd      dKeId"e/eIeIf   fdLZJ	 	 	 	 	 	 	 dddMee   dNeeK   dOeeK   dPeej                  eMdf   dQeej                     dReeIeKej                  f      dSeeK   dTeeK   d"eIfdUZOe'e(e)dVZPeIeMeHd   f   eQdW<   e+e,e.dXZReIeMeHd   f   eQdY<    G dZ d[e3      ZSe G d\ d]             ZTe G d^ d_eT             ZUe G d` daeT             ZVdbeMd"efdcZWy)e    N)ABCabstractmethod)Iterable)	dataclass)AnyCallableOptionalUnion)version)"is_torch_greater_or_equal_than_2_6   )PretrainedConfig)is_hqq_availableis_optimum_quanto_availableis_torch_greater_or_equallogging)	Quantizerc                   n   e Zd ZdZdZd Ze	 ddej                  dej                  de	e
eef      deej                  ej                  f   fd	       Zeddefd
       Zedefd       Zedej                  deeef   fd       ZddZdej(                  deej                  ej                  f   fdZy)CacheLayerMixinz0Base, abstract class for a single layer's cache.Fc                 "    d\  | _         | _        y )N)NNkeysvaluesselfs    k/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/cache_utils.py__init__zCacheLayerMixin.__init__!   s    !+	4;    N
key_statesvalue_statescache_kwargsreturnc                      y N r   r   r    r!   s       r   updatezCacheLayerMixin.update$   s     -0r   c                      y r$   r%   r   cache_positions     r   get_seq_lengthzCacheLayerMixin.get_seq_length,   s    :=r   c                      y r$   r%   r   s    r   get_max_cache_shapez#CacheLayerMixin.get_max_cache_shape/   s    *-r   r*   c                      y r$   r%   r)   s     r   get_mask_sizeszCacheLayerMixin.get_mask_sizes2   s    ORr   c                 l    | j                   j                          | j                  j                          y)z4Resets the cache values while preserving the objectsN)r   zero_r   r   s    r   resetzCacheLayerMixin.reset5   s     		r   beam_idxc                    | j                   j                         rF| j                   j                  }| j                   j                  d|j	                  |            | _         | j
                  j                         rG| j
                  j                  }| j
                  j                  d|j	                  |            | _        yy)z,Reorders this layer's cache for beam search.r   N)r   numeldeviceindex_selecttor   )r   r3   r6   s      r   reorder_cachezCacheLayerMixin.reorder_cache:   s    99??YY%%F		..q(++f2EFDI;;[[''F++221hkk&6IJDK r   r$   r"   N)__name__
__module____qualname____doc__is_compileabler   r   torchTensorr	   dictstrr   tupler'   intr+   r-   r/   r2   
LongTensorr9   r%   r   r   r   r      s    :N, 
 26	0LL0 ll0 tCH~.	0
 
u||U\\)	*0 0 =S= =-S- -RU\\ReCHoR R
Ke&6&6 K5u||A[;\ Kr   r   c                      e Zd ZdZdZ	 ddej                  dej                  deee	e
f      deej                  ej                  f   fdZddefd	Zdefd
Zdej                   ddfdZdeddfdZdeddfdZdej                  ddfdZdej                  deeef   fdZedej                  dej                  dd fd       Zy)DynamicLayeraE  
    A cache layer that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the Key and Value states as tensors with shape `[batch_size, num_heads, seq_len, head_dim]`.

    See `CacheLayerMixin` for details on common methods that are implemented by all cache layers.
    FNr   r    r!   r"   c                    | j                   || _         || _        nPt        j                  | j                   |gd      | _         t        j                  | j                  |gd      | _        | j                   | j                  fS )a  
        Updates the cache with the new `key_states` and `value_states`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            cache_kwargs (`dict[str, Any]`, *optional*):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicLayer`.

        Return:
            A tuple containing the updated key and value states.
        dim)r   r   r@   catr&   s       r   r'   zDynamicLayer.updateN   se    ( 99"DI&DK		499j"9rBDI))T[[,$?RHDKyy$++%%r   c                     | j                   | j                   j                         dk(  ry| j                   j                  d   S )1Returns the sequence length of the cached states.r   rJ   )r   r5   shaper)   s     r   r+   zDynamicLayer.get_seq_lengthj   s4    99		 1Q 6yyr""r   c                      y)zeReturns the maximum sequence length of the cache object. DynamicLayer does not have a maximum length.r%   r   s    r   r-   z DynamicLayer.get_max_cache_shapep   s    r   r3   c                 d   | j                   | j                   j                         r| j                   j                  d|j                  | j                   j                              | _         | j
                  j                  d|j                  | j
                  j                              | _        yyy)DReorders the cache for beam search, given the selected beam indices.Nr   )r   r5   r7   r8   r6   r   r   r3   s     r   r9   zDynamicLayer.reorder_cachet   sw    99 TYY__%6		..q(++dii>N>N2OPDI++221hkk$++BTBT6UVDK &7 r   
max_lengthc                 *   |dk  r| j                         t        |      z
  }| j                         |k  ry| j                  R| j                  j                         r7| j                  dd|ddf   | _        | j                  dd|ddf   | _        yyy)z
        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
        negative to remove `max_length` tokens.
        r   N.)r+   absr   r5   r   )r   rV   s     r   cropzDynamicLayer.cropz   s    
 >,,.Z@J J.99 TYY__%6		#{
{A"56DI++c;J;&9:DK &7 r   repeatsc                     | j                   `| j                   j                         rE| j                   j                  |d      | _         | j                  j                  |d      | _        yyy)z8Repeat the cache `repeats` times in the batch dimension.Nr   rK   )r   r5   repeat_interleaver   r   rZ   s     r   batch_repeat_interleavez$DynamicLayer.batch_repeat_interleave   sW    99 TYY__%6		33G3CDI++77Q7GDK &7 r   indicesc                     | j                   H| j                   j                         r-| j                   |df   | _         | j                  |df   | _        yyy)z<Only keep the `indices` in the batch dimension of the cache.N.)r   r5   r   r   r_   s     r   batch_select_indicesz!DynamicLayer.batch_select_indices   sI    99 TYY__%6		'3,/DI++gsl3DK &7 r   r*   c                 V    d}|j                   d   }| j                         }||z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rP   r+   )r   r*   	kv_offsetquery_lengthpast_seen_tokens	kv_lengths         r   r/   zDynamicLayer.get_mask_sizes   s<    	%++A...0 #33	)##r   r   r   c                 0     |        }||_         ||_        |S )a  
        Build a `DynamicLayer` instance from pre-existing key/value tensors.

        Args:
            keys (`torch.Tensor`):
                Key cache tensor of shape ``[batch_size, num_heads, seq_len, head_dim]``.
            values (`torch.Tensor`):
                Value cache tensor of shape ``[batch_size, num_heads, seq_len, head_dim]``.

        Returns:
            `DynamicLayer`: The newly constructed layer whose internal cache directly references
            the supplied tensors.
        r   )clsr   r   layers       r   from_tensorszDynamicLayer.from_tensors   s     
r   r$   )r;   r<   r=   r>   
is_slidingr@   rA   r	   rB   rC   r   rD   r'   rE   r+   r-   rF   r9   rY   r^   rb   r/   classmethodrk   r%   r   r   rH   rH   D   s)    J 26	&LL& ll& tCH~.	&
 
u||U\\)	*&8#S #S We&6&6 W4 W;s ;t ;Hs Ht H4ELL 4T 4$U\\ $eCHo $  ell ~  r   rH   c                   d   e Zd ZdZdZdZej                  ddfdededed	ed
ej                  de
dee   fdZdefdZ	 ddej                  dej                  deee
ef      deej                  ej                  f   fdZddefdZdej(                  ddfdZdej                  deeef   fdZy)StaticLayerab  
    A static cache layer that stores the Key and Value states as static tensors with shape `[batch_size, num_heads, seq_len, head_dim]`.
    It allocates its full backing tensors up-front and mutates them in-place. Built for `torch.compile` support.

    See `CacheLayerMixin` for details on common methods that are implemented by all cache layers.
    TFcpuNmax_cache_len
batch_size	num_headshead_dimdtyper6   sliding_windowc                    || _         || _        || _        || _        || _        || _        t        j                  ||| j                   |f||      | _        t        j                  ||| j                   |f||      | _	        t        j                  j                  | j                         t        j                  j                  | j                         y)az  
        Args:
            max_cache_len (`int`):
                Maximum number of tokens that can be stored, used for tensor preallocation.
            batch_size (`int`):
                Maximum batch size the cache is pre-allocated for.
            num_heads (`int`):
                Number of attention heads.
            head_dim (`int`):
                Per-head hidden dimension.
            dtype (`torch.dtype`, defaults to `torch.float32`):
                Data type of the cache tensors.
            device (`str` or `torch.device`, defaults to `"cpu"`):
                Device on which the cache tensors will be materialised.

        Notes:
            Static layers allocate their full backing tensors up-front and mutate them
            in-place. See the documentation of `Cache` for shared helper methods that
            operate uniformly across all layer types.
        ru   r6   N)rq   max_batch_sizers   rt   ru   r6   r@   zerosr   r   _dynamomark_static_address)r   rq   rr   rs   rt   ru   r6   rv   s           r   r   zStaticLayer.__init__   s    < +(" 
KKD$6$6A
	
 kkD$6$6A
 	))$))4))$++6r   r"   c                     | j                   S )z+Return the maximum cache shape of the cacherq   r   s    r   r-   zStaticLayer.get_max_cache_shape   s    !!!r   r   r    r!   c                 J   |r|j                  d      nd}|j                  | j                  j                        }|j                  | j                  j                        }| j
                  |j
                  k7  re|j
                  | _        | j                  j                  | j
                        | _        | j                  j                  | j
                        | _        |7| j                  j                  |       | j                  j                  |       n;	 | j                  j                  d||       | j                  j                  d||       | j                  | j                  fS # t        $ r/ || j                  dddd|f<   || j                  dddd|f<   Y Ow xY w)a  
        Update the static cache tensors in place.

        Args:
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.
            cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.

        Returns:
            tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value states.
        r*   N   )	getr8   r   ru   r   r6   copy_index_copy_NotImplementedError)r   r   r    r!   r*   s        r   r'   zStaticLayer.update   sG   " @L))*:;QU]]499??3
#t{{'8'89
 ;;*+++$++DK		T[[1DI++..5DK!IIOOJ'KKl+A		%%aD''><H
 yy$++%%	 ' A2<		!Q./4@Aq.01As   :E* *5F"!F"c                     |t        |d   dz         S | j                  /| j                  d   j                  d      j                         }|S d}|S )rO   rR   r   )r   r   rK   r   )rE   r   anysum)r   r*   
seq_lengths      r   r+   zStaticLayer.get_seq_length"  sb    %~b)A-.. =AII<Qdiio))b)1668
 XY
r   r3   c                     | j                   j                  }|j                  |      }| j                   j                  d|      | _         | j                  j                  d|      | _        y)rT   r   N)r   r6   r8   r7   r   )r   r3   devbeam_idx_devs       r   r9   zStaticLayer.reorder_cache+  sO    ii{{3'II**1l;	kk..q,?r   r*   c                 &    d}| j                   }||fS )NReturn the length and offset of the cache, used to generate the attention maskr   r~   )r   r*   rd   rg   s       r   r/   zStaticLayer.get_mask_sizes2  s    	&&	)##r   r$   )r;   r<   r=   r>   r?   rl   r@   float32rE   ru   rC   r	   r   r-   rA   rB   r   rD   r'   r+   rF   r9   r/   r%   r   r   ro   ro      s    NJ #]](,2727 27 	27
 27 {{27 27 !27h"S " 26	+&LL+& ll+& tCH~.	+&
 
u||U\\)	*+&ZS @e&6&6 @4 @$U\\ $eCHo $r   ro   c                        e Zd ZdZdZ fdZ	 ddej                  dej                  dee	e
ef      deej                  ej                  f   fdZd	ej                  deeef   fd
Z xZS )SlidingWindowLayerz
    A static cache layer that implements sliding window attention caching.

    See `CacheLayerMixin` for details on common methods that are implemented by all cache layers.
    Tc                 v    |j                  dd      }|t        ||      n|}t        |   g ||d|i| y)z
        Args:
            sliding_window (`int`):
                Effective window size: number of tokens that are kept on each update call.
        rq   N)popminsuperr   )r   rv   argskwargsrq   	__class__s        r   r   zSlidingWindowLayer.__init__B  sH     

?D9>K>WNM:]kM$MdMmMfMr   r   r    r!   r"   c                    |r|j                  d      nd}|t        d      | j                  |j                  k7  re|j                  | _        | j                  j	                  | j                        | _        | j
                  j	                  | j                        | _        |j	                  | j                  j                        }|j	                  | j
                  j                        }|j                  d   | j                  kD  rr|dddd| j                   dddf   }|dddd| j                   dddf   }| j                  j                  |       | j
                  j                  |       ||fS t        j                  | j                  | j                        }|d   dz   }|| j                  kD  }	||	j                         z   | j                  z  }
| j                  dddd|
f   }| j
                  dddd|
f   }|j                  d| j                  dz
        }	 |j                  d	||      }|j                  d	||      }| j                  j                  |       | j
                  j                  |       | j                  | j
                  fS # t        $ r; |j!                         }|j!                         }||dddd|f<   ||dddd|f<   Y w xY w)
a  
        Update the sliding window cache tensors in place.

        Args:
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.
            cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.

        Returns:
            tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value states.
        r*   Nz9`cache_position` must be provided for SlidingWindowLayer.r   )r6   rR   r   )r   maxr   )r   
ValueErrorr6   r   r8   r   ru   rP   rq   r   r@   aranger   clamp
index_copyr   clone)r   r   r    r!   r*   new_knew_vslicingcurrent_seq_lento_shiftr_   k_out_shiftedv_out_shiftedupdate_positionk_out_updatedv_out_updateds                   r   r'   zSlidingWindowLayer.updateL  s   " @L))*:;QU!XYY
 ;;*+++$++DK		T[[1DI++..5DK]]499??3
#t{{'8'89 "T%7%77q!d&8&8%8%:A=>E A(:(:':'<a!?@EIIOOE"KKe$|++ ,,t11$++F(,q0"T%7%77X\\^+t/A/AA		!Q-0Aq'M2 )..1$:L:Lq:P.Q	@)44QTM)44QVM 			&-(yy$++%% # 	@)//1M)//1M3=M!Q/03?M!Q/0	@s   (&J AK K r*   c                     |j                   d   }|d   }t        j                  || j                  z
  dz   d      }t	        || j                        }||fS )r   r   r   r   )rP   r@   r   rq   r   )r   r*   re   first_cache_positionrd   rg   s         r   r/   z!SlidingWindowLayer.get_mask_sizes  s\    %++A.-a0KK 4t7I7I IA MSTU	d&8&89	)##r   r$   )r;   r<   r=   r>   rl   r   r@   rA   r	   rB   rC   r   rD   r'   rE   r/   __classcell__r   s   @r   r   r   9  s     JN 26	A&LLA& llA& tCH~.	A&
 
u||U\\)	*A&F$U\\ $eCHo $r   r   c                        e Zd ZdZ fdZ	 ddej                  dej                  deee	e
f      deej                  ej                  f   fdZd fdZd	ej                  deeef   fd
Z xZS )ChunkedSlidingLayerz
    An extended SlidingWindowLayer that supports prefill chunking, originally implemented for Llama 4.

    See `SlidingWindowLayer` for details on common methods that are implemented by all cache layers.
    c                 2    t        |   |i | d| _        y Nr   )r   r   cumulative_lengthr   r   r   r   s      r   r   zChunkedSlidingLayer.__init__  s    $)&)!"r   r   r    r!   r"   c                 b   |r|j                  d      nd }|t        d      | j                  |j                  k7  re|j                  | _        | j                  j	                  | j                        | _        | j
                  j	                  | j                        | _        | j                  }| xj                  |j                  d   z  c_        || j                  k\  }|rt        j                  | j                  d d d d dd d d f   |fd      }t        j                  | j
                  d d d d dd d d f   |fd      }|j                  d   dk(  r1| j                  j                  |       | j
                  j                  |       | j                  | j
                  fS |s||j                  d   z   | j                  kD  ro|dk(  r|}|}nt        j                  | j                  d d d d d |d d f   |fd      }t        j                  | j
                  d d d d d |d d f   |fd      }nS	 | j                  j                  d||       | j
                  j                  d||       | j                  | j
                  fS | j                  j                  |d d d d | j                   d d d f          | j
                  j                  |d d d d | j                   d d d f          ||fS # t        $ r/ || j                  d d d d |f<   || j
                  d d d d |f<   Y w xY w)Nr*   z:`cache_position` must be provided for ChunkedSlidingLayer.rJ   r   rK   r   r   )r   r   r6   r   r8   r   r   rP   rq   r@   rM   r   r   r   )	r   r   r    r!   r*   r   is_fullfull_key_statesfull_value_statess	            r   r'   zChunkedSlidingLayer.update  s    @L))*:;QU!YZZ
 ;;*+++$++DK		T[[1DI++..5DK 22*"2"22"66#t'9'99#ii1aQ;)?(LRTUO %		4;;q!QR{+C\*RXZ [ #q(		0!!"34yy$++--.1A1A!1DDtGYGYY A%",$0!"'))TYYq!=O>O=OQR7R-SU_,`fh"i$)IIt{{1aASBSASUV;V/WYe.fln$o!A		%%aD''><H 99dkk))		1t/A/A.A.CQ(FGH+Aq43E3E2E2G,JKL 111 ' A2<		!Q./4@Aq.01As   6:K6 65L.-L.c                 0    t         |           d| _        y r   )r   r2   r   r   r   s    r   r2   zChunkedSlidingLayer.reset  s    !"r   r*   c                     |j                   d   }|d   }| j                  }t        j                  ||z
  dz   d      }||k\  r||z   dz
  }||fS ||k  r||z   |kD  r	||z   }||fS |}||fS )Nr   r   r   )rP   rq   r@   r   )r   r*   re   r   rv   rd   rg   s          r   r/   z"ChunkedSlidingLayer.get_mask_sizes  s    %++A.-a0++KK 4~ E IqQ	>1&59I )## "N27Kl7Z]k7k,|;I )## 'I)##r   r$   r:   )r;   r<   r=   r>   r   r@   rA   r	   rB   rC   r   rD   r'   r2   rE   r/   r   r   s   @r   r   r     s    # 26	22LL22 ll22 tCH~.	22
 
u||U\\)	*22h#$U\\ $eCHo $r   r   c                   4   e Zd ZdZddZ	 ddddej                  dej                  d	ed
ee	e
ef      deej                  ej                  f   fdZ	 ddddej                  dej                  d	ed
ee	e
ef      deej                  ej                  f   fdZy)CacheProcessorz
    Base class for cache processors. It defines a pre-update and post-update methods that are called before and after the cache update.
    This class should be subclassed.
    cacheCacher"   Nc                 H    t        d| j                  j                   d      )a  
        Initialize the processor and perform compatibility checks with the cache.

        Args:
            cache (`Cache`): The cache instance this processor will be applied to.
            **kwargs: Additional arguments that may be needed for initialization.
        z!Make sure to implement `init` in .)r   r   r;   )r   r   r   s      r   r   zCacheProcessor.__init__  s%     "$EdnnF]F]E^^_"`aar   r   r    	layer_idxr!   c                 
    ||fS )a  
        Function called before the cache update. Can modify the key/value states.

        Args:
            cache (`Cache`): The cache instance.
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.
            layer_idx (`int`): The index of the layer to cache the states for.
            cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.

        Returns:
            The modified key and value states.
        r%   r   r   r   r    r   r!   s         r   
pre_updatezCacheProcessor.pre_update   s    * <''r   key_tensorsvalue_tensorsc                 
    ||fS )a(  
        Function called after the cache update. Can process the cached data.

        Args:
            cache (`Cache`): The cache instance.
            key_states (`torch.Tensor`): The key states that were cached.
            value_states (`torch.Tensor`): The value states that were cached.
            layer_idx (`int`): The index of the layer that was updated.
            cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.

        Returns:
            The final key and value states to return to the model.
        r%   )r   r   r   r   r   r!   s         r   post_updatezCacheProcessor.post_update  s    * M))r   )r   r   r"   Nr$   )r;   r<   r=   r>   r   r@   rA   rE   r	   rB   rC   r   rD   r   r   r%   r   r   r   r     s    
b  26(( LL( ll	(
 ( tCH~.( 
u||U\\)	*(: 26** \\* ||	*
 * tCH~.* 
u||U\\)	**r   r   c                       e Zd ZdZddddeeej                  f   fdZ	 ddddej                  dej                  d	e
d
eeeef      deej                  ej                  f   fdZddd	e
fdZddd	e
fdZddd	e
fdZy)OffloadedCacheProcessora  
    A cache processor that offloads cache tensors to conserve accelerator memory.

    This processor manages moving cache tensors between accelerator and CPU memory,
    using asynchronous prefetching to minimize performance impact. Works with both
    dynamic and static layers.
    r   r   offload_devicec                    t        j                  |      | _        g | _        d| _        d| _        t         j                  j                         sNt        dd      rt         j                  j                         s#t        dt        dd      rdz         dz         t        d |j                  D              | _        | j                  rt        |j                        D ]  \  }}|d	k(  r|j                  d
   n| j                  }|j                   j#                  |      |_        |j$                  j#                  |      |_        | j                  j'                  |j                  d
           t)        |      |j*                  k7  rt-        d      t        dd      rt        j.                         | _        yt         j                  j/                         | _        y)z@Initialize the offload processor and check device compatibility.N2.7T
accept_devz3OffloadedCacheProcessor can only be used with a GPUz or XPU c              3   <   K   | ]  }t        |t                y wr$   )
isinstancero   .0rj   s     r   	<genexpr>z3OffloadedCacheProcessor.__init__.<locals>.<genexpr>H  s     VZ{;Vs   r   r6   z?If static layers are used, all cache layers must be initialized)r@   r6   r   original_deviceprefetch_streamr3   cudais_availabler   xpuRuntimeErrorr   layers	is_static	enumeratelayer_init_kwargsr   r8   r   appendlennum_hidden_layersr   Stream)r   r   r   r   irj   r6   s          r   r   z OffloadedCacheProcessor.__init__8  s   #ll>:!# JJ##%)%DAeiiF\F\F^E 9%D Q9[ WY[ 
 VVV>>%ell3 O5>?1f00:$J]J]"ZZ]]62
$||v6$$++E,C,CH,MN	O
 5zU444 !bcc 8$OELLN 	UZU_U_UfUfUh 	r   Nr   r    r   r!   r"   c                 $   t        |      |k  rt        d      t        |      |k(  r;| j                  j                  |j                         | j                  ||       ||fS t        dd      r-t        j                  j                         j                          n,t        j                  j                         j                          | j                  ||       | j                  ||       | j                  ||dz   t        |      z         ||fS )z5Handles prefetching and eviction before cache update.zWOffloadedCache does not support model usage where layers are skipped. Use DynamicCache.r   Tr   r   )r   r   r   r   r6   _evict_previous_layerr   r@   acceleratorcurrent_streamsynchronizer   _ensure_layer_on_device_prefetch_layerr   s         r   r   z"OffloadedCacheProcessor.pre_updateV  s     u:	!vwwZ9$  ''
(9(9:&&ui8 <'' )4@!!002>>@

))+779&&ui8((	:   Q#e*(DE<''r   c                    |t        |      k  rt        dd      r| j                  n(t        j                  j                  | j                        5  | j                  |   }|j                  |   j                  j                  |d      |j                  |   _        |j                  |   j                  j                  |d      |j                  |   _
        ddd       yy# 1 sw Y   yxY w)z(Starts prefetching the next layer cache.r   Tr   non_blockingN)r   r   r   r@   r   streamr   r   r   r8   r   )r   r   r   r6   s       r   r   z'OffloadedCacheProcessor._prefetch_layerr  s    s5z! -UtD $$ZZ&&t';';<n --i8/4||I/F/K/K/N/Nvdh/N/iY',16i1H1O1O1R1RSYhl1R1mY'.n n "n ns   BC##C,c                 \   t        |      dk\  r|dz
  t        |      z  }|j                  |   j                  j                  | j                  d      |j                  |   _        |j                  |   j
                  j                  | j                  d      |j                  |   _        yy)z*Moves the previous layer cache to the CPU.r   r   Tr   N)r   r   r   r8   r   r   )r   r   r   prev_layer_idxs       r   r   z-OffloadedCacheProcessor._evict_previous_layer  s    u:?'!ms5z9N05^0L0Q0Q0T0T##$ 1U 1ELL(- 38,,~2N2U2U2X2X##$ 3Y 3ELL(/ r   c                    |t        |      k  r| j                  j                          | j                  | j                  j	                  | j
                  |         | _        |j                  |   j                  j                  d| j                        |j                  |   _        |j                  |   j                  j                  d| j                        |j                  |   _	        yyy)z4Ensures the current layer is on the original device.Nr   )
r   r   r   r3   r8   r   r   r   r7   r   )r   r   r   s      r   r   z/OffloadedCacheProcessor._ensure_layer_on_device  s    s5z!  ,,. }}( $ 0 01E1Ei1P Q/4||I/F/K/K/X/XYZ\`\i\i/jY',16i1H1O1O1\1\]^`d`m`m1nY'. ) "r   rp   r$   )r;   r<   r=   r>   r
   rC   r@   r6   r   rA   rE   r	   rB   r   rD   r   r   r   r   r%   r   r   r   r   /  s    
g 
uS%,,=N7O 
H 26(( LL( ll	(
 ( tCH~.( 
u||U\\)	*(8nW n n
7 
s 

oW 
o 
or   r   c                      e Zd ZdZddddddej
                  dfdd	d
ededededededej                  defdZ	d Z
	 d dd	dej                  dej                  dedeeeef      deej                  ej                  f   fdZdej                  dedej                  fdZdej                  dej                  fdZdedefdZy)!QuantizedCacheProcessorz
    A cache processor that applies quantization to cache tensors to reduce memory usage.

    This processor quantizes cache tensors after they are stored, maintaining a residual
    length in original precision and quantizing older tokens.
    quanto   r   @      rp   r   r   backendnbitsaxis_key
axis_valueq_group_sizeresidual_lengthcompute_dtyper6   c
                    || _         || _        || _        || _        || _        || _        || _        |	| _        g | _        g | _	        | j                          d| _        t        |j                  d   t              st        d      y)a  
        Parameters:
            backend (`str`, defaults to `"quanto"`):
                Backend to use when performing quantization, Can be one of [`quanto`, `HQQ`]
            nbits (`int`, defaults to 4):
                Number of bits, can be 2 or 4 for the `quanto` backend and one of [1, 2, 3, 4, 8] for the `HQQ` backend. Defaults to 2.
            axis_key (`int`, defaults to 0):
                Axis over which to perform grouping for the key tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend.
            axis_value (`int`, defaults to 0):
                Axis over which to perform grouping for the value tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend.
            q_group_size (`int`, defaults to 64):
                Size of the quantization group, should be a divisor of the model's hidden dimension.
                Defaults to 64.
            residual_length (`int`, defaults to 128):
                Length of the residual cache which will always be stored in original precision.
                Defaults to 128.
            compute_dtype (`torch.dtype`, defaults to `torch.float16`):
                The default dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization.
            device (`str`, defaults to `"cpu"`):
                Device on which to perform computations, should be same as the model's device.
        r   z<QuantizedCacheProcessor is only compatible with DynamicCacheN)r   r   r   r   r   r   r   r6   _quantized_keys_quantized_valuesvalidateerased_lengthr   r   rH   r   )
r   r   r   r   r   r   r   r   r   r6   s
             r   r   z QuantizedCacheProcessor.__init__  s    B 
 $(.*3557 %,,q/<8[\\ 9r   c                    d}| j                   dvr't        |j                  dd| j                               | j                  dk  r't        |j                  dd| j                              | j                  dk  r't        |j                  d	d| j                              | j
                  d
vr't        |j                  dd| j
                              | j                  d
vr't        |j                  dd| j                              yz-Validates if the arguments passed are correctzvSome of the keys in `cache_config` are defined incorrectly. `{key}` should be {correct_value}` but found {found_value}r   r      r      r   z2 or 4 or 8)keycorrect_valuefound_valuer   r   za positive integerr   )r   r   rR   r   z`1` or `0`, `-1`r   z`1` or `0` or `-1`Nr   r   formatr   r   r   r   r   incorrect_arg_msgs     r   r  z QuantizedCacheProcessor.validate  @   & 	 ::_,!(("/ $

 )   !!((&"6 $ 1 1 )   !#!(()"6 $ 4 4 )   ==
*!((""4 $ )   ??*,!(($"6 $ )   -r   Nr   r   r   r!   r"   c                    t        |      |k  rt        d      | j                  |      r | j                  j	                  | j                  |j                         | j                               | j                  j	                  | j                  |j                         | j                               |j                  d   | _        t        j                  d|j                  |j                        |j                   |   _        t        j                  d|j                  |j                        |j                   |   _        ||}}||fS | j'                  | j                  |         }| j'                  | j                  |         }	t        j(                  ||gd      }t        j(                  |	|gd      }|j                  d   | j*                  k\  r| j                  |j                         | j                        | j                  |<   | j                  |j                         | j                        | j                  |<   | xj                  |j                  d   z  c_        t        j                  d|j                  |j                        |j                   |   _        t        j                  d|j                  |j                        |j                   |   _        ||fS )z&Apply quantization after cache update.zWQuantizedCache does not support model usage where layers are skipped. Use DynamicCache.)axisrJ   r   rx   rK   )r   r   _is_quantized_length_zeror  r   	_quantize
contiguousr   r  r   rP   r  r@   rz   ru   r6   r   r   r   _dequantizerM   r   )
r   r   r   r   r   r!   keys_to_returnvalues_to_returndequant_keydequant_values
             r   r   z#QuantizedCacheProcessor.post_update  s    u:	!vww
 )))4  ''{7M7M7OVZVcVc(de""))$..9Q9Q9SZ^ZiZi.*jk "-!2!22!6D+0;;!''")),ELL#(
 .3[[#))$++.ELL#* 0;M,N4 ///- **4+?+?	+JKK ,,T-C-CI-NOM"YY['ArJN$yy-)GRP  $(<(<<26..AZAZA\cgcpcp.2q$$Y/48NNCSC^C^C`gkgvgvN4w&&y1 ""k&7&7&;;"/4{{%++&--0Y',
 27'--(//2Y'. ///r   tensorr  c                     t        d      )zHQuantize a tensor - to be implemented by specific quantization backends.z4Quantization backend must implement _quantize methodr   )r   r  r  s      r   r  z!QuantizedCacheProcessor._quantizeD  s    !"XYYr   c                     t        d      )zJDequantize a tensor - to be implemented by specific quantization backends.z6Quantization backend must implement _dequantize methodr  )r   r  s     r   r  z#QuantizedCacheProcessor._dequantizeH  s    !"Z[[r   c                 2    |t        | j                        k\  S )zCheck if quantized cache is empty for layer. Note: shape[-2] is unreliable since quantized tensors are bit-packed and flattened.)r   r  r   r   s     r   r  z1QuantizedCacheProcessor._is_quantized_length_zeroL  s    C 4 4555r   r$   )r;   r<   r=   r>   r@   float16rC   rE   ru   r   r  rA   r	   rB   r   rD   r   r  r  boolr  r%   r   r   r   r     sZ     "%*]]1]1] 1] 	1]
 1] 1] 1] 1] {{1] 1]f2t 26;0;0 \\;0 ||	;0
 ;0 tCH~.;0 
u||U\\)	*;0zZ ZC ZELL Z\%,, \5<< \63 64 6r   r   c                        e Zd ZdZddddddej
                  dfdd	d
ededededededej                  deddf fdZ	dej                  dedej                  fdZdej                  dej                  fdZ xZS )QuantoQuantizedCacheProcessorz
    Quantized cache processor that uses `quanto` as a backend to perform quantization.
    Current implementation supports `int2` and `int4` dtypes only.
    r   r   r   r   r   rp   r   r   r   r   r   r   r   r   r   r6   r"   Nc
                 `   t         |   |||||||||		       |dk7  rt        d|       t               rct	        j
                  t        j                  j	                  d            }
|
t	        j
                  d      k  rt        d|
 d      ddl	m
}m}m} | j                  d	vrt        d
| j                         | j                  dvrt        d| j                         | j                  dvrt        d| j                         | j                  dk(  rn| _                | _        y)z-Initialize the quanto quantization processor.r   zFQuantoQuantizedCacheProcessor only supports `quanto` backend, but got zoptimum-quantoz0.2.5zYou need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCacheProcessor`. Detected version r   r   )MaxOptimizerqint2qint4)r   r   zA`nbits` for `quanto` backend has to be one of [`2`, `4`] but got )r   rR   zE`axis_key` for `quanto` backend has to be one of [`0`, `-1`] but got zG`axis_value` for `quanto` backend has to be one of [`0`, `-1`] but got r   N)r   r   r   r   r   parse	importlibmetadataImportErroroptimum.quantor(  r)  r*  r   r   r   qtype	optimizer)r   r   r   r   r   r   r   r   r   r6   optimum_quanto_versionr(  r)  r*  r   s                 r   r   z&QuantoQuantizedCacheProcessor.__init__W  sL    	7E8ZXegm	
 hefmenopp&(%,]]93E3E3M3MN^3_%`"%w)??! Z  [q  Zr  rs  t  BA::V#`aeakak`lmnn=='deiererdstuu??')YZ^ZiZiYjk  #jjAoU5
%r   r  r  c                     t               rTddlm} | j                  || j                  || j
                        \  }} ||| j                  |||| j
                        }|S y)z%Quantize tensor using quanto backend.r   )quantize_weightN)r   r/  r4  r1  r0  r   )r   r  r  r4  scale	zeropointqtensors          r   r  z'QuantoQuantizedCacheProcessor._quantize  sW    &(6#~~fdjj$HYHYZE9%fdjj$yRVRcRcdGN )r   r7  c                 "    |j                         S )z'Dequantize tensor using quanto backend.)
dequantize)r   r7  s     r   r  z)QuantoQuantizedCacheProcessor._dequantize  s    !!##r   )r;   r<   r=   r>   r@   r#  rC   rE   ru   r   rA   r  r  r   r   s   @r   r&  r&  Q  s      "%*]](((( (( 	((
 (( (( (( (( {{(( (( 
((T C ELL $5<< $ELL $r   r&  c                       e Zd ZdZddddddej
                  dfdd	d
ededededededej                  deddf fdZ	dej                  dedeej                  ef   fdZdeej                  ef   dej                  fdZ xZS )HQQQuantizedCacheProcessorz
    Quantized cache processor that uses `HQQ` as a backend to perform quantization.
    Current implementation supports `int2`, `int4`, `int8` dtypes.
    r   r   r   r   r   rp   r   r   r   r   r   r   r   r   r   r6   r"   Nc
                 T   t         
|   |||||||||		       |dk7  rt        d|       | j                  dvrt        d| j                         | j                  dvrt        d| j                         | j
                  dvrt        d| j
                         t        | _        y)	z*Initialize the HQQ quantization processor.r   zCHQQQuantizedCacheProcessor only supports `quanto` backend, but got r  zM`nbits` for `HQQ` backend has to be one of [`1`, `2`, `3`, `4`, `8`] but got )r   r   zA`axis_key` for `HQQ` backend has to be one of [`0`, `1`] but got zC`axis_value` for `HQQ` backend has to be one of [`0`, `1`] but got N)r   r   r   r   r   r   HQQQuantizer	quantizer)r   r   r   r   r   r   r   r   r   r6   r   s             r   r   z#HQQQuantizedCacheProcessor.__init__  s     	7E8ZXegm	
 hbcjbklmm::_,_`d`j`j_kl  ==&`aeanan`opqq??&(bcgcrcrbstuu%r   r  r  c                    | j                   j                  ||| j                  | j                  | j                  | j
                        \  }}| j                  |d<   | j                   j                  ||| j                         |d   j                  |j                        |d<   |d   j                  |j                        |d<   ||fS )z"Quantize tensor using HQQ backend.)r  r6   r   r   
group_sizer   )metar6   r5  zero)r>  quantizer6   r   r   r   r   r8   )r   r  r  r7  rA  s        r   r  z$HQQQuantizedCacheProcessor._quantize  s    //;;,,**(( 0 
 !% 2 2_G$t{{CW((8WF|w~~6V}r   qtensor_and_metac                 H    |\  }}| j                   j                  ||      }|S )z$Dequantize tensor using HQQ backend.)r>  r9  )r   rD  quant_tensorrA  r  s        r   r  z&HQQQuantizedCacheProcessor._dequantize  s'    -d**<>r   )r;   r<   r=   r>   r@   r#  rC   rE   ru   r   rA   rD   rB   r  r  r   r   s   @r   r;  r;    s      "%*]]&& & 	&
 & & & & {{& & 
&B C E%,,PTBT<U  E%,,2D,E %,, r   r;  fn.r"   c                     t        j                         	 ddt        j                  dt        j                  dt        dt
        t        t        t        f      dt        t        j                  t        j                  f   f
 fd       }|S )Nr   r    r   r!   r"   c                     | j                   "| j                   j                  | ||||      \  }} | ||||      \  }}| j                   "| j                   j                  | ||||      \  }}||fS )zM
        Wrapper around the update method to apply cache processors.
        )cache_processorr   r   )r   r   r    r   r!   r   r   rG  s          r   _wrapped_updatez)apply_processors.<locals>._wrapped_update  s     +'+';';'F'Fj,	<($J &(j,	S_%`"]+)-)=)=)I)Ik=)\*&K M))r   r$   )
	functoolswrapsr@   rA   rE   r	   rB   rC   r   rD   )rG  rK  s   ` r   apply_processorsrN    s     __R 26*LL* ll* 	*
 tCH~.* 
u||U\\)	** *0 r   c                   6    e Zd ZdZd	dZd Zd Zd Zd Zd Z	y)
KeyValuesWrapperzHelper class for Cache that simulates layer-indexed key/value lists from a layered cache.
    This allows for BC access and writing, e.g., cache.key_cache[idx] = ...
    Deprecated in favor of Cache.layers[idx].keys/values. TODO: remove in v4.56.0c                      || _         || _        y r$   )r   
cache_type)r   r   rR  s      r   r   zKeyValuesWrapper.__init__  s    $r   c                     t        |t              r1| j                  |   D cg c]  }t        || j                         c}S t        | j                  |   | j                        S c c}w r$   )r   slicer   getattrrR  )r   idxrj   s      r   __getitem__zKeyValuesWrapper.__getitem__  sQ    c5!AESAQRGE4??3RRt{{3'99 Ss   A%c                     t        |t              r9t        | j                  |   |      D ]  \  }}t	        || j
                  |        y t	        | j                  |   | j
                  |       y r$   )r   rT  zipr   setattrrR  )r   rV  valuerj   vals        r   __setitem__zKeyValuesWrapper.__setitem__  sZ    c5!!$++c"2E: 5
st45 DKK$doou=r   c                 ,    t        | j                        S r$   )r   r   r   s    r   __len__zKeyValuesWrapper.__len__  s    4;;r   c              #   ^   K   | j                   D ]  }t        || j                          y wr$   )r   rU  rR  r   rj   s     r   __iter__zKeyValuesWrapper.__iter__  s*     [[ 	2E%11	2s   +-c                 ,    t        | j                        S r$   )r$  r   r   s    r   __bool__zKeyValuesWrapper.__bool__  s    DKK  r   N)r   )
r;   r<   r=   r>   r   rW  r]  r_  rb  rd  r%   r   r   rP  rP    s&    U%:
> 2!r   rP  c                   ,   e Zd ZdZ	 	 	 	 	 	 	 	 d+deeee      ee   f   dee	   deee
ee   f      dee   dee   deej                  e
df   d	eej                     d
eeeej                  f      dee   fdZdedeej&                  ej&                  f   fdZd Zd Zd ZdeddfdZe	 d,dej&                  dej&                  dedeee
ef      deej&                  ej&                  f   f
d       Zd-dedefdZdej&                  dedeeef   fdZedefd       Z edefd       Z!d.dedefdZ"d Z#dejH                  fd Z%d!efd"Z&d#efd$Z'd%ej&                  fd&Z(edefd'       Z)edefd(       Z*ede+fd)       Z,edee+   fd*       Z-y)/r   a  
    Base container for per-layer key/value caches.

    A `Cache` behaves like a list of `CacheLayerMixin` objects, one per model layer.
    Sub-classes such as `DynamicCache`, `StaticCache`, or `SlidingWindowCache`
    simply pre-select which `CacheLayerMixin` class to use and may attach a
    `CacheProcessor` (off-loading, quantization).

    Example
    -------
    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
    tok   = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    inputs = tok("Hello", return_tensors="pt")

    cache = DynamicCache()
    outputs = model(**inputs, past_key_values=cache, use_cache=True)
    ```

    Parameters:
        layer_classes (`type[CacheLayerMixin]` or `list[type[CacheLayerMixin]]`):
            A list of `CacheLayerMixin` classes to instantiate for the cache. If only a `CacheLayerMixin` class is
            provided, then it is used for all layers.
        config (`PretrainedConfig`, *optional*):
            Model configuration used to infer number of layers, head sizes, default
            device/dtype, etc.
        cache_processor (`CacheProcessor` or `str`, *optional*):
            Cache processor to apply (e.g., "offloaded", "quanto_quantized", "hqq_quantized")
            or a CacheProcessor class.
        max_batch_size (`int`, *optional*): Maximum batch size for static caches.
        max_cache_len (`int`, *optional*): Maximum sequence length. For hybrid caches, SlidingWindowLayers are
            clamped to `min(sliding_window, max_cache_len)`, StaticLayers use full `max_cache_len`.
        device (`torch.device`, *optional*): Device for cache tensors.
        dtype (`torch.dtype`, *optional*): Data type for cache tensors.
        layer_device_map (`dict[int, Union[str, torch.device]]`, *optional*): Per-layer device mapping.
        tp_size (`int`, *optional*): Tensor parallel size to adjust the number of key/value heads.

    Additional keyword arguments are forwarded to the chosen layers constructor(s) and CacheProcessors. See the
    documentation of the relevant `CacheLayerMixin` class and `CacheProcessor` class for more details.
    Nlayer_classesconfigrJ  ry   rq   r6   ru   layer_device_maptp_sizec
                 T   g | _         || _        t        |t              r	t        |   n|}|
j                  ||||||	       t        ||
      \  }}
t        |fi |
| _        t        |dd      | _
        | j                  | j                  dz
         | || fi || _        y d | _        y )N)ry   rq   r6   ru   rh  ri  r   r   )r   rf  r   rC   PROCESSOR_CLASS_MAPr'   parse_processor_args"parse_layer_args_from_model_configr   rU  r   append_new_layersrJ  )r   rf  rg  rJ  ry   rq   r6   ru   rh  ri  r   processor_classprocessor_kwargss                r   r   zCache.__init__8  s     .0*BL_^aBb-o>hw)'- 	 	
 $8#P &!CF!Uf!U!(1Da!Ht559:L[LgtH7GHmqr   r   r"   c                     |t        | j                        k  r2| j                  |   j                  | j                  |   j                  fS t	        dt        | j                         d|       z
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
        sequence length.
        zCache only has z. layers, attempted to access layer with index )r   r   r   r   KeyErrorr"  s     r   rW  zCache.__getitem__Y  sh    
 s4;;'';;y)..I0F0M0MMM!#dkk"2!33abkalm r   c              #      K   t        t        |             D ]6  }| j                  |   j                  | j                  |   j                  f 8 ywz
        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
        keys and values
        N)ranger   r   r   r   r"  s     r   rb  zCache.__iter__e  sK     
 s4y) 	OI;;y)..I0F0M0MNN	Os   AAc                 ^   t        | dd      #t        | dd      t        | j                        S yt        | dd      duxrT t        | j                        dk(  xr: t	        | j                  d   t
              xr | j                  d   j                  du }|st        | j                        S dS )
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        r   N	key_cacher   r   )rU  r   ry  r   r   rH   r   )r   dynamic_emptys     r   r_  zCache.__len__m  s     44(0t[$/;4>>** D(D)5 ,DKK A%,4;;q><8, A##t+	 	 (5s4;;;!;r   c                 N    | j                   j                   d| j                   dS )Nz(layers=))r   r;   r   r   s    r   __repr__zCache.__repr__  s$    ..))*(4;;-qAAr   c                    t        | j                        |k  r| j                  j                         }| j                  j	                  dd      *|j                  d      t        | j                           |d<   t        | j                  t              r"| j                  t        | j                           n| j                  } |di |}| j                  j                  |       t        | j                        |k  ryy)a  
        Appends layers to the cache until the layer `layer_idx` is reached.
        Used for preallocation in static caches and on the fly in dynamic caches.

        Args:
            layer_idx (`int`):
                The index of the layer to append.
        rh  Nr6   r%   )
r   r   r   copyr   r   r   rf  listr   )r   r   r   new_layer_class	new_layers        r   rn  zCache.append_new_layers  s     $++)+++002F%%))*<dCO#)::.@#A#dkkBR#Sx  9C4CUCUW[8\""3t{{#34bfbtbt  (1&1IKKy) $++)+r   r   r    r!   c                 d    | j                  |       | j                  |   j                  |||      S )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`dict[str, Any]`, *optional*):
                Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
                cache to be created.

        Return:
            A tuple containing the updated key and value states.
        )rn  r   r'   )r   r   r    r   r!   s        r   r'   zCache.update  s0    2 	y){{9%,,Z|TTr   c                 &   |t        | j                        k\  ry| j                  Ot        | j                  t              r5| j                  j
                  | j                  |   j                  |      z   S | j                  |   j                  |      S )zhReturns the sequence length of the cache for the given layer. TODO: deprecate in favor of cache_positionr   )r   r   rJ  r   r   r  r+   r   r   r*   s      r   r+   zCache.get_seq_length  sx    DKK((+
4;O;OQh0i''55I8N8]8]^l8mmm{{9%44^DDr   r*   c                 L    | j                   |   j                  |      \  }}||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        )r   r/   )r   r*   r   rg   rd   s        r   r/   zCache.get_mask_sizes  s-      ${{95DD^T	9)##r   c                 X    t         j                  d       t        | j                  d      S )zgList-like object of key cache tensors indexed by layer. Deprecated in favor of `cache.layers[idx].keys`zj`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.r   loggerwarning_oncerP  r   r   s    r   ry  zCache.key_cache  s(     	x	
  V44r   c                 X    t         j                  d       t        | j                  d      S )zkList-like object of value cache tensors indexed by layer. Deprecated in favor of `cache.layers[idx].values`zn`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.r   r  r   s    r   value_cachezCache.value_cache  s(     	|	
  X66r   c                 <    | j                   |   j                         S )zaReturns maximum sequence length of the cache object. Dynamic caches do not have a maximum length.)r   r-   r"  s     r   r-   zCache.get_max_cache_shape  s    {{9%99;;r   c                     t        t        | j                              D ]  }| j                  |   j                          ! y)z$Recursively reset all layers tensorsN)rv  r   r   r2   r"  s     r   r2   zCache.reset  s4    s4;;/0 	+IKK	"((*	+r   r3   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)z!Reorder the cache for beam searchN)rv  r   r   r9   )r   r3   r   s      r   r9   zCache.reorder_cache  s6    s4;;/0 	;IKK	"00:	;r   rV   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)z"Crop the cache to the given lengthN)rv  r   r   rY   )r   rV   r   s      r   rY   z
Cache.crop  s6    s4;;/0 	4IKK	"''
3	4r   rZ   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)zRepeat and interleave the cacheN)rv  r   r   r^   )r   rZ   r   s      r   r^   zCache.batch_repeat_interleave  s8    s4;;/0 	DIKK	"::7C	Dr   r_   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)zSelect indices from the cacheN)rv  r   r   rb   )r   r_   r   s      r   rb   zCache.batch_select_indices  s8    s4;;/0 	AIKK	"77@	Ar   c                     | j                   D cg c]  }|j                   }}t        t        |            dkD  rt	        d|       |d   S c c}w )z*Return the maximum batch size of the cacher   z0Max batch size is not consistent across layers: r   )r   ry   r   setr   r   rj   r   s      r   ry   zCache.max_batch_size  sV     59KK@5%&&@@s6{aOPVxXYYay As   Ac                 h    | j                   D cg c]  }|j                   }}t        |      S c c}w )z,Return the maximum cache length of the cache)r   rq   r   r  s      r   rq   zCache.max_cache_len  s1     48;;?%%%%??6{ @s   /c                 :    t        d | j                  D              S )z'Return whether the cache is compileablec              3   4   K   | ]  }|j                     y wr$   )r?   r   s     r   r   z'Cache.is_compileable.<locals>.<genexpr>  s     AE5''As   )allr   r   s    r   r?   zCache.is_compileable  s     AT[[AAAr   c                 V    | j                   D cg c]  }t        |dd       c}S c c}w )z9Return whether the layers of the cache are sliding windowrl   F)r   rU  ra  s     r   rl   zCache.is_sliding  s'     BFM|U3MMMs   &)NNNNNNNNr$   r   N)r   ).r;   r<   r=   r>   r
   r  typer   r	   r   rC   r   rE   r@   r6   ru   rB   r   rD   rA   rW  rb  r_  r}  rn  rN  r   r'   r+   r/   propertyrP  ry  r  r-   r2   rF   r9   rY   r^   rb   ry   rq   r$  r?   rl   r%   r   r   r   r     s   )\ .2FJ(,'+15'+>B!%rT$"78$:OOPr )*r "%T.-A(A"BC	r
 !r  }r ellC-.r $r #4U\\(9#:;r #rB
S 
U5<<3M-N 
O<&B*3 *4 *(  26ULLU llU 	U
 tCH~.U 
u||U\\)	*U U6E E E$U\\ $c $eTWY\T\o $ 5+ 5 5 7- 7 7<S < <+
;e&6&6 ;
4s 4
Ds D
AELL A
    s  
 B B B NDJ N Nr   r   c                       e Zd ZdZd
deeeej                  ej                  f         f fdZ	deeej                  ej                  f   df   fdZ
edeeej                  ej                  f   df   ddfd	       Z xZS )DynamicCachea(  
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.

    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> past_key_values = DynamicCache()
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        DynamicCache()
        ```
    ddp_cache_datac                     t        |   |dt        i| |:|D ]4  \  }}| j                  j	                  t        j                  ||             6 y y Nrf  )r   r   rH   r   r   rk   )r   r  r   r   r   r    r   s         r   r   zDynamicCache.__init__*  s\    dE|EfE %,: X(
L""<#<#<Z#VWX &r   r"   .c                 d    d}| j                   D ]  }||j                  |j                  ffz  }  |S )z
        Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for
        backward compatibility.
        r%   )r   r   r   )r   legacy_cacherj   s      r   to_legacy_cachezDynamicCache.to_legacy_cache6  s<    
 [[ 	:Eejj%,,799L	:r   past_key_valuesr   c                      |        }|4t        t        |            D ]  }||   \  }}|j                  |||        |S )z
        Converts a cache in the legacy cache format into an equivalent `Cache`. Used for
        backward compatibility.
        )rv  r   r'   ri   r  r   r   r   r    s         r   from_legacy_cachezDynamicCache.from_legacy_cache@  sR     &"3#78 B	+:9+E(
LZyAB r   r$   )r;   r<   r=   r>   r	   r   rD   r@   rA   r   r  rm   FloatTensorr  r   r   s   @r   r  r    s    6
Xxu||U\\?Y9Z0['\ 
XuU\\5<<-G'H#'M!N  
eE<M<MuO`O`<`6acf6f0g 
ls 
 
r   r  z2.3r   c                 `   t        d | j                  D              rt        d      t        st        j                  d       | j                  D cg c]  }|j                  |j                   c}| j                  D cg c]  }|j                  |j                   c}dS c c}w c c}w )Nc              3   >   K   | ]  }t        |t                 y wr$   )r   rH   r   s     r   r   z"_get_cache_dict.<locals>.<genexpr>S  s     Mu:e\22Ms   zFThis pytree flattening function should only be applied to DynamicCachez[DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.)ry  r  )r   r   r   r   r  r  r   r   )r   rj   s     r   _get_cache_dictr  R  s    MMMghh1m
 38,,Y%**BX%**Y6;ll_UellF^ELL_
 	
Y_s   B&#B&B+B+contextc                    t         j                  j                  j                  | |      }t	               }|j                  dg       }|j                  dg       }t        t        t        |      t        |                  D ]?  }|t        |      k  r||   nd }|t        |      k  r||   nd }|j                  |||       A |S )Nry  r  )
r@   utils_pytree_dict_unflattenr  r   rv  r   r   r'   )	r   r  
dictionaryr   key_list
value_listrV  r  r[  s	            r   _unflatten_dynamic_cacher  `  s     [[((88I
>>+r2^^M26
S]C
O<= 	*C#&X#6(3-DC'*S_'<JsO$ELLeS)	* r   c                 f    t         j                  j                  j                  t	        |             S r$   )r@   r  r  _dict_flattenr  dynamic_caches    r   <lambda>r  q  s!    ekk11??P]@^_ r   r   c                 f    t         j                  j                  j                  t	        |             S r$   )r@   r  r  _dict_flatten_with_keysr  r  s    r   r  r  t  s#    5;;3F3F3^3^M*4
 r   )serialized_type_nameflatten_with_keys_fnc                 h    t         j                  j                  j                  t	        |       |      S r$   )r@   fxr  _dict_flatten_specr  )r   specs     r   r  r  z  s$    %((*:*:*M*Mo^cNdfj*k r   c                   $     e Zd ZdZd fdZ xZS )OffloadedCachea  
    A drop-in replacement for DynamicCache that conserves accelerator(GPU, XPU) memory at the expense of more CPU memory.
    Useful for generating from models with very long context.

    In addition to the default accelerator stream, where all forward() computations happen,
    this class uses another stream, the prefetch stream, which it creates itself.
    Since scheduling of operations on separate streams happens independently, this class uses
    the prefetch stream to asynchronously prefetch the KV cache of layer k+1 when layer k is executing.
    The movement of the layer k-1 cache to the CPU is handled by the default stream as a simple way to
    ensure the eviction is scheduled after all computations on that cache are finished.
    c                 .    t         |   t               y )N)rJ  r   r   r   r   s    r   r   zOffloadedCache.__init__  s    )@Ar   r:   r;   r<   r=   r>   r   r   r   s   @r   r  r  ~  s    
B Br   r  c                   "     e Zd ZdZ fdZ xZS )StaticCachea  
    Static Cache class to be used with `torch.compile(model)` and `torch.export()`.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache

        >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

        >>> inputs = tokenizer(text="My name is Llama", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
        >>> past_key_values = StaticCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        StaticCache()
        ```
    c                 0    t        |   |dt        i| y r  )r   r   ro   r   s      r   r   zStaticCache.__init__  s    TD{DVDr   r  r   s   @r   r  r    s    2E Er   r  c                   $     e Zd ZdZd fdZ xZS )OffloadedStaticCacheaG  
    A drop-in replacement for StaticCache that conserves accelerator memory by offloading
    cache tensors to CPU when not actively being used.

    This cache maintains the compilation-friendly properties of StaticCache while enabling
    much longer sequences by offloading inactive layers to CPU memory.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:
        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, OffloadedStaticCache

        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer(text="My name is GPT2", return_tensors="pt")

        >>> # Prepare a cache class with offloading
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
        >>> past_key_values = OffloadedStaticCache(
        ...     config=model.config,
        ...     max_batch_size=1,
        ...     max_cache_len=max_generated_length,
        ...     device=model.device,
        ...     dtype=model.dtype
        ... )
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache with offloaded layers
        OffloadedStaticCache()
        ```
    c                 0    t        |   |dt        i| y NrJ  r  r   s      r   r   zOffloadedStaticCache.__init__      $R0GR6Rr   r:   r  r   s   @r   r  r    s    BS Sr   r  c                   "     e Zd ZdZ fdZ xZS )SlidingWindowCachea  
    Sliding Window Cache class to be used with `torch.compile` for models like Mistral that support sliding window attention.
    Every time when we try to update the cache, we compute the `indices` based on `cache_position >= self.sliding_window - 1`,
    if true(which means the cache can not hold all the old key value states and new states together because of the sliding window constraint),
    we need to do a cycle shift based on `indices` to replace the oldest states by the new key value states passed in.

    The `to_shift` is only true once we are above sliding_window. Thus with `sliding_window==64`:

    indices = (slicing + to_shift[-1].sum()-1) % self.sliding_window
    tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 59, 60, 61, 62, 63,  0])

    We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SlidingWindowCache

        >>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

        >>> inputs = tokenizer(text="My name is Mistral", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
        >>> past_key_values = SlidingWindowCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        SlidingWindowCache()
        ```
    c                 0    t        |   |dt        i| y r  )r   r   r   r   s      r   r   zSlidingWindowCache.__init__  s    DK'9KFKr   r  r   s   @r   r  r    s    $LL Lr   r  c                   (     e Zd ZdZdef fdZ xZS )HybridCachea  
    Hybrid Cache class to be used with `torch.compile` for models that alternate between a local sliding window
    attention and global attention in every other layer (originally implemented for Gemma2).
    Under the hood, Hybrid Cache leverages ["SlidingWindowCache"] for sliding window attention and ["StaticCache"]
    for global attention. For more information, see the documentation of those layer types.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, HybridCache

        >>> model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")

        >>> inputs = tokenizer(text="My name is Gemma", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
        >>> past_key_values = HybridCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        HybridCache()
        ```
    rg  c                     t        |d      r!|j                  D cg c]  }t        |    }}nt        g|j                  z  }t        |   |||d| y c c}w )Nlayer_types)rg  rf  )hasattrr  LAYER_CLASS_MAPro   r   r   r   )r   rg  r   r   
layer_typerf  r   s         r   r   zHybridCache.__init__  s[    6=)KQK]K]^Z_Z8^M^ )MF,D,DDMdmUfU	 _s   A)r;   r<   r=   r>   r   r   r   r   s   @r   r  r    s    8V/ V Vr   r  c                       e Zd Zy)HybridChunkedCacheN)r;   r<   r=   r%   r   r   r  r  &  s    r   r  c                   $     e Zd ZdZd fdZ xZS )OffloadedHybridCachea  
    A drop-in replacement for HybridChunkedCache that conserves accelerator memory by offloading
    cache tensors to CPU when not actively being used.

    This cache maintains the compilation-friendly properties of HybridChunkedCache while enabling
    much longer sequences by offloading inactive layers to CPU memory.

    See `Cache` for details on common methods that are implemented by all cache classes.
    c                 0    t        |   |dt        i| y r  r  r   s      r   r   zOffloadedHybridCache.__init__4  r  r   r:   r  r   s   @r   r  r  )  s    S Sr   r  c                   $     e Zd ZdZd fdZ xZS )QuantizedCachea  
    A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750).
    It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.

    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length` is set as a maximum capacity for the
    original precision cache. When the length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache. The
    quantization is done per-channel with a set `q_group_size` for both Keys and Values, in contrast to what was described in the paper.

    It stores Keys and Values a list of quantized tensors (tuples in case we need to store metadata), one for each layer. Additionally, it stores the Key and
    Value in original precision states as a list of tensors, one for each layer. The size of each tensor
    is `[batch_size, num_heads, seq_len - residual_length, head_dim]`.

    See `Cache` for details on common methods that are implemented by all cache classes.
    c                 v    |dk(  rt         }n|dk(  rt        }nt        d| d      t        |   dd|i| y )Nr   hqqzUnknown quantization backend ``rJ  r%   )r&  r;  r   r   r   )r   r   r   	processorr   s       r   r   zQuantizedCache.__init__H  sG    h5I2I=gYaHII==f=r   r:   r  r   s   @r   r  r  8  s    > >r   r  c                       e Zd ZdZddZy)QuantoQuantizedCachea  
    A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
    It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.

    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length` is set as a maximum capacity for the
    original precision cache. When the length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache. The
    quantization is done per-channel with a set `q_group_size` for both Keys and Values, in contrast to what was described in the paper.

    It stores Keys and Values a list of quantized tensors (tuples in case we need to store metadata), one for each layer. Additionally, it stores the Key and
    Value in original precision states as a list of tensors, one for each layer. The size of each tensor
    is `[batch_size, num_heads, seq_len - residual_length, head_dim]`

    Uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> # Run pip install quanto first if you don't have it yet
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig

        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_config = QuantizedCacheConfig(nbits=4)
        >>> past_key_values = QuantoQuantizedCache(cache_config=cache_config)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        QuantoQuantizedCache()
        ```
    Nc                 <    t        j                  | fdt        i| y r  )r  r   r&  r   r   s     r   r   zQuantoQuantizedCache.__init__x  s    d\4Q\U[\r   r:   r;   r<   r=   r>   r   r%   r   r   r  r  S  s    "H]r   r  c                       e Zd ZdZdddZy)HQQQuantizedCachea  
    A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750).
    It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.

    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length` is set as a maximum capacity for the
    original precision cache. When the length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache. The
    quantization is done per-channel with a set `q_group_size` for both Keys and Values, in contrast to what was described in the paper.

    It stores Keys and Values a list of quantized tensors (tuples in case we need to store metadata), one for each layer. Additionally, it stores the Key and
    Value in original precision states as a list of tensors, one for each layer. The size of each tensor
    is `[batch_size, num_heads, seq_len - residual_length, head_dim]`

    Uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> # Run pip install hqq first if you don't have it yet
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig

        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_config = QuantizedCacheConfig(nbits=4, axis_key=1, axis_value=1)
        >>> past_key_values = HQQQuantizedCache(cache_config=cache_config)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        HQQQuantizedCache()
        ```
    Nc                 J    |dk(  sJ t        j                  | fdt        i| y )NHQQrJ  )r  r   r;  )r   r   r   s      r   r   zHQQQuantizedCache.__init__  s*    %dY4NYRXYr   )r  r:   r  r%   r   r   r  r  |  s    "HZr   r  c                        e Zd ZdZdZdedef fdZd Zdede	e
j                  e
j                  e
j                  e
j                  f   fd	Zd
 Zde	e	e
j                        fdZede	e	e
j                   e
j                   f   df   dd fd       Zd"dee   defdZd Zde
j*                  fdZdefdZdefdZdededdfdZdefdZde
j                  fdZdefdZd e
j                  dede	eef   fd!Z xZS )#EncoderDecoderCachea[  
    Base, abstract class for all encoder-decoder caches. Can be used to hold combinations of self-attention and
    cross-attention caches.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM, DynamicCache, EncoderDecoderCache

        >>> model = AutoModelForCausalLM.from_pretrained("openai/whisper-small")
        >>> processor = AutoProcessor.from_pretrained("openai/whisper-small")

        >>> inputs = processor(audio=YOUR-AUDIO, return_tensors="pt")

        >>> # Prepare cache classes for encoder and decoder and pass it to model's forward
        >>> self_attention_cache = DynamicCache()
        >>> cross_attention_cache = DynamicCache()
        >>> past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access cache filled with key/values from generation
        EncoderDecoderCache()
        ```

    Nself_attention_cachecross_attention_cachec                    t         |   t               || _        || _        t        | j                  dd      | _        i | _        t        t        |            D ],  }t        |j                  |      dkD        | j                  |<   . y )N)rf  r?   Fr   )r   r   rH   r  r  rU  r?   
is_updatedrv  r   r$  r+   )r   r  r  r   r   s       r   r   zEncoderDecoderCache.__init__  s    |4$8!%:"%d&?&?AQSXYs#89: 	cI)-.C.R.RS\.]`a.a)bDOOI&	cr   c              #   V  K   t        t        |             D ]  }| j                  j                  |   j                  | j                  j                  |   j
                  | j                  j                  |   j                  | j                  j                  |   j
                  f  ywru  )rv  r   r  r   r   r   r  r"  s     r   rb  zEncoderDecoderCache.__iter__  s     
 s4y) 	I))00;@@))00;BB**11)<AA**11)<CC	 	s   B'B)r   r"   c                 f   |t        |       k  r| j                  j                  |   j                  | j                  j                  |   j                  | j
                  j                  |   j                  | j
                  j                  |   j                  fS t        dt        |        d|       rr  )r   r  r   r   r   r  rs  r"  s     r   rW  zEncoderDecoderCache.__getitem__  s    
 s4y ))00;@@))00;BB**11)<AA**11)<CC	  _SYK7efoepqrrr   c                 ,    t        | j                        S )rx  )r   r  r   s    r   r_  zEncoderDecoderCache.__len__  s    
 4,,--r   c                    d}t        | j                        dkD  rOt        | j                  j	                         | j                  j	                               D ]  \  }}|||z   fz  } |S | j                  j	                         }|S )z[Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.r%   r   )r   r  rY  r  r  )r   r  	self_attn
cross_attns       r   r  z#EncoderDecoderCache.to_legacy_cache  s    t))*Q.),))99;T=W=W=g=g=i* :%	: Z!7 99:   44DDFLr   r  .c                 R    | t               t                     }|t        t        |            D ]s  }||   dd \  }}|j                  j	                  |||       t        ||         dkD  s=||   dd \  }}|j
                  j	                  |||       d|j                  |<   u |S )zUConverts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.)r  r  Nr   T)r  rv  r   r  r'   r  r  r  s         r   r  z%EncoderDecoderCache.from_legacy_cache  s    
 !-"..
 &"3#78 7	+:9+Ebq+I(
L**11*lIVy12Q6/>y/I!"/M,J//66z<QZ[26E$$Y/7 r   c                 :    | j                   j                  ||      S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.)r  r+   r  s      r   r+   z"EncoderDecoderCache.get_seq_length  s     ((77	>RRr   c                    t        | j                  d      r| j                  j                          t        | j                  d      r| j                  j                          nnt        | j                  d      sXt        | j                  d      sBt	        d| j                  j                          d| j                  j                          d      | j                  D ]  }d| j                  |<    y )Nr2   zNeither self nor cross-attention cache have valid `.reset()` methods. `.reset()` should only be called on compatible cache classes, such as `StaticCache` or `SlidingWindowCache`. Got " for the self attention cache and  for the cross attention cache.F)r  r  r2   r  r   __str__r  r"  s     r   r2   zEncoderDecoderCache.reset  s    4,,g6%%++-4--w7&&,,.22G<WTMgMgipEq0088:;;]--55788WY   	/I).DOOI&	/r   r3   c                 p    | j                   j                  |       | j                  j                  |       y)rT   N)r  r9   r  rU   s     r   r9   z!EncoderDecoderCache.reorder_cache%  s*    !!//9""00:r   methodc           	          t        | j                  t              rt        | j                  t              sEt	        d| d| j                  j                          d| j                  j                          d      y )Nr  z)` is only defined for dynamic cache, got r  r  )r   r  r  r  r   r   )r   r  s     r   check_dynamic_cachez'EncoderDecoderCache.check_dynamic_cache*  sw    t00,?455|DF8DTE^E^EfEfEhDi j''+'A'A'I'I'K&LLkm  Er   maximum_lengthc                     | j                  | j                  j                         | j                  j                  |       y)z
        Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be
        negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search.
        N)r  rY   r;   r  )r   r  s     r   rY   zEncoderDecoderCache.crop5  s0    
 	  !3!34!!&&~6r   full_batch_size
split_sizezlist[EncoderDecoderCache]c                 "   | j                  | j                  j                         | j                  j                  ||      }| j                  j                  ||      }g }t        ||      D ]   \  }}|j                  t        ||             " |S )z
        Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`
        )r  batch_splitr;   r  r  rY  r   r  )r   r  r  r  r  outr  r  s           r   r
  zEncoderDecoderCache.batch_split=  s    
 	  !1!1!:!:;#88DD_V`a $ : : F FXb c%()=?T%U 	C!IzJJ*9jAB	C
r   rZ   c                     | j                  | j                  j                         | j                  j                  |       | j                  j                  |       y)zTRepeat the cache `repeats` times in the batch dimension. Used in contrastive search.N)r  r^   r;   r  r  r]   s     r   r^   z+EncoderDecoderCache.batch_repeat_interleaveK  sD      !=!=!F!FG!!99'B""::7Cr   r_   c                     | j                  | j                  j                         | j                  j                  |       | j                  j                  |       y)zXOnly keep the `indices` in the batch dimension of the cache. Used in contrastive search.N)r  rb   r;   r  r  ra   s     r   rb   z(EncoderDecoderCache.batch_select_indicesQ  sD      !:!:!C!CD!!66w?""77@r   c                 6    | j                   j                         S )zKReturns the maximum sequence length (i.e. max capacity) of the cache object)r  r-   r   s    r   r-   z'EncoderDecoderCache.get_max_cache_shapeW  s    ((<<>>r   r*   c                 :    | j                   j                  ||      S r$   )r  r/   )r   r*   r   s      r   r/   z"EncoderDecoderCache.get_mask_sizes[  s    ((77	RRr   r  ) r;   r<   r=   r>   r?   r   r   rb  rE   rD   r@   rA   rW  r_  r  rm   r  r  r	   r+   r2   rF   r9   rC   r  rY   r
  r^   rb   r-   r/   r   r   s   @r   r  r    s   8 NcU c5 csS sU5<<u||]b]i]i3i-j s.
uU\\':!; 
 #E%*;*;U=N=N*N$OQT$TU	 $S SSV S
/;e&6&6 ;
# 73 73 C D_ Ds DAELL A?S ?SU\\ Sc SeTWY\T\o Sr   r  ro  r   c                 2   	 t        t        j                  | j                        j                        dd }|D ci c]  }||v s|||    }}|j                         D ci c]  \  }}||vs|| }}}||fS # t
        $ r i |fcY S w xY wc c}w c c}}w )a  
    Parse processor arguments from kwargs based on the processor class init signature.

    Args:
        processor_class: The processor class to inspect, or None
        kwargs: Dictionary of keyword arguments

    Returns:
        tuple: (processor_kwargs, remaining_kwargs)
    r   N)r  inspect	signaturer   
parameters	Exceptionitems)ro  r   paramskrp  vremaining_kwargss          r   rl  rl  _  s    g''(@(@ALLMabQ /5DV6!9DD)/UA1DT;T1UU---  6z EUs(   5A; 	BB"B/B;B
Brg  rr   rq   r6   ru   rh  ri  ry   c                 b   | i S t        | dd      4d| j                  v r&d| j                  v rt        | dd      t        d      |xs | j                  }t        | dd      | j                  n| j
                  | j                  z  }t        | dd      | j                  n| j                  }	|$|d	kD  r|	|z  d
k7  rt        d|	 d| d      |	|z  }	||n|||t        j                  |      nd||||	t        | dd      d}
|
j                         D ci c]  \  }}|	|| c}}S c c}}w )a5  
    Parse layer arguments from model configuration for cache initialization.

    Args:
        config (`Optional[PretrainedConfig]`): Model configuration containing shape/device info.
        batch_size (`Optional[int]`): Batch size for cache initialization.
        max_cache_len (`Optional[int]`): Maximum sequence length for cache.
        device (`Union[torch.device, str, None]`): Device for cache tensors.
        dtype (`Optional[torch.dtype]`): Data type for cache tensors.
        layer_device_map: Per-layer device mapping.
        tp_size (`Optional[int]`): Tensor parallel size to adjust number of key/value heads.
        max_batch_size (`Optional[int]`): Maximum batch size for cache initialization.

    Returns:
        `dict`: Dictionary containing parsed layer arguments for cache initialization.
    Nr  sliding_attentionfull_attentionrv   zSetting up a hybrid or sliding window KVCache requires the model config supporting sliding window attention, please check if there is a `sliding_window` field in the model config and it's not set to None.rt   num_key_value_headsr   r   zNumber of key value heads z+ must be divisible by tensor parallel size r   )rr   rq   r6   ru   rh  rt   rs   rv   )rU  r  r   max_position_embeddingsrt   hidden_sizenum_attention_headsr  r@   r6   r  )rg  rr   rq   r6   ru   rh  ri  ry   rt   rs   
layer_argsr  r  s                r   rm  rm  t  s   6 ~	
 FM40<#v'9'99 F$6$66v/6> 7  &G)G)G vz40< OO##v'A'AA 	 v4d;C &&++ 	
 7Q;7"a' 0;fgnfoopq  '!I,:,F.J*.4.@ell6*d 0 "%f.>E	

 ",!1!1!3EAq}1EEEs   
D+"D+)r  r  chunked_attentionr  )	offloadedquanto_quantizedhqq_quantizedrk  c                       e Zd ZdZddZy)	SinkCachea  
    Is its now a `custom_generate` repository on the Hub: https://huggingface.co/transformers-community/sink_cache.
    See [these docs](https://huggingface.co/docs/transformers/generation_strategies#custom-decoding-methods) for
    general `custom_generate`usage.
    Nc                     t        d      )Nz`SinkCache` has been moved as a `custom_generate` repository on the Hub: https://huggingface.co/transformers-community/sink_cache. See the repository for usage examples.r  r  s     r   r   zSinkCache.__init__  s    !o
 	
r   r:   r  r%   r   r   r'  r'    s    
r   r'  c                       e Zd ZU dZded<   d Zed        Zdee	e
j                  f   fdZdee	ef   fd	Zd
 Zd Zd Zd Zy)CacheConfigzT
    Base class for cache configs. Deprecated in favor of a simpler dictionary.
    Ncache_implementationc                 .    t         j                  d       y NZCacheConfig is deprecated and will be removed in v4.55.0 in favor of a simpler dictionary.)r  r  r   s    r   __post_init__zCacheConfig.__post_init__  s    h	
r   c                     t         j                  d        | di |}g }|j                         D ]0  \  }}t        ||      st	        |||       |j                  |       2 |D ]  }|j                  |d        |S )ar  
        Constructs a CacheConfig instance from a dictionary of parameters.
        Args:
            config_dict (dict[str, Any]): Dictionary containing configuration parameters.
            **kwargs: Additional keyword arguments to override dictionary values.

        Returns:
            CacheConfig: Instance of CacheConfig constructed from the dictionary.
        r.  Nr%   )r  r  r  r  rZ  r   r   )ri   config_dictr   rg  	to_remover  r[  s          r   	from_dictzCacheConfig.from_dict  s     	h	
 #{#	 ,,. 	&JCvs#U+  %	&  	"CJJsD!	"r   json_file_pathc                     t        |dd      5 }| j                         }t        j                  |dd      dz   }|j	                  |       ddd       y# 1 sw Y   yxY w)	a  
        Save this instance to a JSON file.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this configuration instance's parameters will be saved.
            use_diff (`bool`, *optional*, defaults to `True`):
                If set to `True`, only the difference between the config instance and the default
                `QuantizationConfig()` is serialized to JSON file.
        wzutf-8)encodingr   T)indent	sort_keys
N)opento_dictjsondumpswrite)r   r4  writerr1  json_strings        r   to_json_filezCacheConfig.to_json_file  sU     .#8 	&F,,.K**[dKdRKLL%		& 	& 	&s   =AAr"   c                 @    t        j                  | j                        S )z
        Serializes this instance to a Python dictionary. Returns:
            `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        )r  deepcopy__dict__r   s    r   r<  zCacheConfig.to_dict  s    
 }}T]]++r   c              #      K   t        j                  | j                        j                         D ]  \  }}||f  yw)zTallows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixinN)r  rD  rE  r  )r   attrr[  s      r   rb  zCacheConfig.__iter__  s9     ==7==? 	KD%+	s   =?c                 T    | j                   j                   d| j                          S )N )r   r;   to_json_stringr   s    r   r}  zCacheConfig.__repr__$  s(    ..))*!D,?,?,A+BCCr   c                 J    t        j                  | j                  d      dz   S )z
        Serializes this instance to a JSON formatted string.
        Returns:
            str: JSON formatted string representing the configuration instance.
        r   )r8  r:  )r=  r>  rE  r   s    r   rJ  zCacheConfig.to_json_string'  s     zz$--2T99r   c                     g }|j                         D ]0  \  }}t        | |      st        | ||       |j                  |       2 |j                         D ci c]  \  }}||vs|| }}}|S c c}}w )a  
        Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
        returning all the unused kwargs.

        Args:
            kwargs (`dict[str, Any]`):
                Dictionary of attributes to tentatively update this class.

        Returns:
            `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
        )r  r  rZ  r   )r   r   r2  r  r[  unused_kwargss         r   r'   zCacheConfig.update0  s}     	 ,,. 	&JCtS!c5)  %	& 7=lln]
US\H\e]] ^s   A0&A0)r;   r<   r=   r>   __annotations__r/  rm   r3  r
   rC   osPathLikerB  rB   r   r<  rb  r}  rJ  r'   r%   r   r   r*  r*    sk     

  0&5bkk1A+B &$,c3h ,D:r   r*  c                       e Zd ZdZddddddej
                  dfded	ee   d
ee   dee   dee   dee   deej                     dee   fdZ
d Zy)QuantizedCacheConfiga4  
    Configuration class for quantized cache settings. Deprecated in favor of a simpler dictionary.

    Attributes:
        backend (`str`, *optional*, defaults to `"quanto"`):
            Backend to use when performing quantization, Can be one of [`quanto`, `HQQ`]
        nbits (`Optional[int]`, *optional*, defaults to 4):
            Number of bits, can be 2 or 4 for the `quanto` backend and one of [1, 2, 3, 4, 8] for the `HQQ` backend. Defaults to 2.
        axis_key (`int`, *optional*, defaults to 0):
            Axis over which to perform grouping for the key tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend.
        axis_value (`int`, *optional*, defaults to 0):
            Axis over which to perform grouping for the value tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend.
        q_group_size (`Optional[int]`, *optional*, defaults to 64):
            Size of the quantization group, should be a divisor of the model's hidden dimension.
            Defaults to 64.
        residual_length (`Optional[int]`, *optional*, defaults to 128):
            Length of the residual cache which will always be stored in original precision.
            Defaults to 128.
        compute_dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization.
        device (`str`, *optional*, defaults to `"cpu"`):
            Device on which to perform computations, should be same as the model's device.
    r   r   r   r   r   rp   r   r   r   r   r   r   r   r6   c	                     t         j                  d       || _        || _        || _        || _        || _        || _        || _        || _	        y r-  )
r  r  r   r   r   r   r   r   r   r6   )	r   r   r   r   r   r   r   r   r6   s	            r   r   zQuantizedCacheConfig.__init__a  sR     	h	
 
 $(.*r   c                    d}| j                   dvr't        |j                  dd| j                               | j                  dk  r't        |j                  dd| j                              | j                  dk  r't        |j                  d	d| j                              | j
                  d
vr't        |j                  dd| j
                              | j                  d
vr't        |j                  dd| j                              yr  r  r  s     r   r  zQuantizedCacheConfig.validatex  r  r   N)r;   r<   r=   r>   r@   r#  rC   r	   rE   ru   r   r  r%   r   r   rR  rR  G  s    4   "#$%&(),/4}} % } 3-	
 SM sm "#  , .2r   rR  c                   R    e Zd ZdZdZddedefdZd Zd Zded	d
fdZ	ded	d
fdZ
y
)StaticCacheConfigz8
    Configuration class for static cache settings.
    staticrr   rq   c                 X    t         j                  d       || _        || _        || _        y r-  )r  r  rr   rq   r6   )r   rr   rq   r6   s       r   r   zStaticCacheConfig.__init__  s+    h	
 %*r   c                    t        | j                        |kD  ry|j                  d   }| j                  |   r|j                  n| j
                  }| j                  |    }| j                  || j                  | j                  f}| j                  || j                  | j                  f}| j                  |   r|n|}t        j                  || j                  ||      }	t        j                  || j                  ||      }
t        j                  j                  |	       t        j                  j                  |
       | j                  j                  |	       | j                   j                  |
       | j"                  | j                  |   sg | _        g | _        t'        d      D ]  }t        j                  || j                  |j                        }t        j                  || j                  |j                        }t        j                  j                  |	       t        j                  j                  |
       | j"                  j                  |       | j$                  j                  |        yyy)zIOverridden to use the correct device if offloaded layer (and pin memory).Nr   )ru   r6   
pin_memoryr   rx   )r   ry  rP   rl   r6   r   ry   rq   rt   rv   r@   rz   _dtyper{   r|   r   r  device_key_cachedevice_value_cacherv  )r   r   r   r  r6   rZ  global_cache_shapesliding_cache_shapecache_shapenew_layer_key_cachenew_layer_value_cache_device_layer_key_cachedevice_layer_value_caches                 r   initialise_cache_layerz(StaticCacheConfig.initialise_cache_layer  s	   t~~*(..q1&*ooi&@""dFYFY33
"113FHZHZ\`\i\ij#224GI\I\^b^k^kl .2__Y-G)M_#kk+T[[QWdno %Kt{{SYfp q))*=>))*?@12 56   (1K$&D!&(D#1X I).[\f\m\m)n&+0;;{$++^h^o^o+p(112EF112GH%%,,-CD''../GHI	 2L(r   c                 $   | j                   Bt        j                  j                  |j                        j                  | j                          | j                  | j                     }| j                  | j                     }| j                  |       ||d d d d |f<   ||d d d d |f<   |j                  | j                        | j                  |   d d d d |f<   |j                  | j                        | j                  |   d d d d |f<   ||fS r$   )_prefetch_streamr@   r   default_streamr6   wait_streamr\  active_device_layerr]  _prefetch_next_layerr8   r   ry  r  )r   r*   r   r   r    k_outv_outrq   s           r   _static_updatez StaticCacheConfig._static_update  s      ,JJ%%j&7&78DDTEZEZ[ %%d&>&>?''(@(@A 	!!), '1aN"#&2aN"# ;E--H[H[:\y!!Q"67<HOODL_L_<`#Aq.$89e|r   r   r"   Nc                    | j                   dk(  rdnd| _         	 |dz   | j                  |dz   d j                  d      z   }| j                  Dt
        j                  j                  | j                        5  | j                  |       ddd       y| j                  |       y# t        $ r | j                  j                  d      }Y w xY w# 1 sw Y   yxY w)zCBased on current layer_idx, prefetch next full layer to the device.r   r   NF)	rk  rl   indexr   rh  r@   r   r   _prefetch_layer_in_context)r   r   
next_layers      r   rl  z&StaticCacheConfig._prefetch_next_layer  s     )-(@(@A(E11 	6"QQ)I)O)OPU)VVJ   ,""4#8#89 <//
;< < ++J7  	6..u5J	6
< <s   'B# 6C#$C
	C
Cc                    t        | j                        |kD  ro| j                  | j                     j	                  | j                  |   d       | j
                  | j                     j	                  | j                  |   d       y| j                  | j                     j                  d       | j
                  | j                     j                  d       y)z6Performs the actual copy of the layer to device cache.Tr   g        N)r   ry  r\  rk  r   r]  r  fill_r"  s     r   rr  z,StaticCacheConfig._prefetch_layer_in_context	  s    t~~*!!$":":;AA$..QZB[jnAo##D$<$<=CCDDTDTU^D_nrCs !!$":":;AA#F##D$<$<=CCCHr   r   )r;   r<   r=   r>   r+  rE   r   rf  ro  rl  rr  r%   r   r   rV  rV    sW     $3 s IB,8c 8d 8(IC ID Ir   rV  namec                 x    | dk(  r!t         j                  d        G d d      }|S t        dt        d|       )N
MambaCachezImporting `MambaCache` from `transformers.cache_utils` is deprecated and will be removed in a future version. Please import it from `transformers` or `transformers.models.mamba.cache_mamba` instead.c            
           e Zd ZdZdZej                  dfdedej                  de	ej                  edf   fdZded	ej                  d
ej                  dej                  fdZdedej                  fdZd Zy)__getattr__.<locals>.MambaCachea4  
            Importing `MambaCache` from `transformers.cache_utils` is deprecated and will be removed
            in a future version. Please import it from `transformers` or `transformers.models.mamba.cache_mamba` instead.

            Cache for mamba model which does not have attention mechanism and key value states.

            Arguments:
                config (`PretrainedConfig):
                    The configuration file defining the shape-related attributes required to initialize the static cache.
                max_batch_size (`int`):
                    The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a smaller batch size is used.
                dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
                    The default `dtype` to use when initializing the layer.
                device (`torch.device` or `str`, *optional*):
                    The device on which the cache should be initialized. Should be the same as the layer.

            Example:

                ```python
                >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

                >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
                >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

                >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

                >>> # Prepare a cache class and pass it to model's forward
                >>> past_key_values = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
                >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
                >>> outputs.past_key_values
                MambaCache()
                ```
            TNry   ru   r6   c                    || _         || _        |j                  | _        |j                  | _        |j
                  | _        g | _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                   | j                  | j                  || j                        }t        j                  | j                   | j                  | j                  || j                        }t        j                  j                  |       t        j                  j                  |       | j                  j!                  |       | j                  j!                  |        y )Nr6   ru   )ry   r[  intermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statesr@   r6   rv  r   rz   r{   r|   r   )r   rg  ry   ru   r6   rc  
conv_state	ssm_states           r   r   z(__getattr__.<locals>.MambaCache.__init__B	  s/    '5##)/)A)A&&,&7&7#(.(:(:%79 68171Cf-v778 6A/4{{++..--%"kk0J /4kk++..++%"kk/I MM55jAMM55i@$$++J7OO**95'6r   r   new_conv_stater*   r"   c                 "   | j                   |   j                  |j                  k7  r5| j                   |   j                  |j                        | j                   |<   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j                  |j                  |j                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r   rR   )shiftsdimsr|  )r  r6   r8   r   r  rollru   r1   )r   r   r  r*   r  s        r   update_conv_statez1__getattr__.<locals>.MambaCache.update_conv_stateg	  s    
 ##I.559N9NN262B2B92M2P2PQ_QfQf2gD$$Y/!--i8
!/!5!5a9N9NQR9R!S'__BR_@
3A3D3DJL]L]eoeueu3D3v
1a/0  +113  +z9+''	22r   new_ssm_statec                     |j                  | j                  |   j                        | j                  |<   | j                  |   S r$   )r8   r  r6   )r   r   r  s      r   update_ssm_statez0__getattr__.<locals>.MambaCache.update_ssm_statex	  s9    -:-=-=dooi>X>_>_-`	*y11r   c                     t        t        | j                              D ]<  }| j                  |   j                          | j                  |   j                          > y r$   )rv  r   r  r1   r  r"  s     r   r2   z%__getattr__.<locals>.MambaCache.reset|	  sM    !&s4+;+;'<!= 7I$$Y/557OOI.4467r   )r;   r<   r=   r>   r?   r@   r#  rE   ru   r
   r6   rC   r   rA   rF   r  r  r2   r%   r   r   rx  rz  	  s     D "N &+]]9=#6 !$#6 {{	#6
 ellC56#6J3!$36;ll3TYTdTd33"2# 2ell 27r   zmodule z has no attribute )r  r  AttributeErrorr;   )rv  rx  s     r   __getattr__r  	  sL    ||	

d	7 d	7L 
78,.@I
JJr   )NNNNNNN)Xr  rL  importlib.metadatar,  r  r=  rO  abcr   r   collections.abcr   dataclassesr   typingr   r   r	   r
   r@   	packagingr   transformers.pytorch_utilsr   configuration_utilsr   r  r   r   r   r   hqq.core.quantizer   r=  
get_loggerr;   r  r   rH   ro   r   r   r   r   r   r&  r;  rD   rA   rN  rP  r   r  r  r  Contextr  register_pytree_noder<   r  register_pytree_flatten_specr  r  r  r  r  r  r  r  r  r  r  r  rB   rl  rE   r6   rC   ru   rm  r  rN  rk  r'  r*  rR  rV  r  r%   r   r   <module>r     si        	 # $ ! 1 1   I 1 d d ; 
		H	%%Kc %KPk? k\D$/ D$N^$ ^$BS$, S$l<* <*~fon foRv6n v6r;$$; ;$|;!8 ;|eELL%,,6778c5u||3445>! !@N ND=5 =D U#
| 
$$,, 
KK,,_  , 7 78,:O:O9PQ
 -  
HH11k
B\ B$E% E<#S; #SL(L (LV#V% #VN + *S- S>\ >6&]> &]R'Z 'ZTvS% vSr.(48H3I*J .TX .]bcgimcm]n .. !%#'-1#':>!$(IF%&IFIF C=IF %,,T)*	IF
 EKK IF tC$567IF c]IF SMIF 
IFZ "+,7c4 1223  )//: T#t$4556 
 
 e e eP b; b bJ bI bI bINnKc nKc nKr   