
import torch

from ..generation.continuous_batching import PagedAttentionCache
from ..utils import is_flash_attn_2_available


try:
    if is_flash_attn_2_available():
        from flash_attn import flash_attn_varlen_func
except Exception:
    pass


def paged_attention_forward(
    module: torch.nn.Module,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    attention_mask: torch.Tensor = None,
    cache: PagedAttentionCache = None,
    cumulative_seqlens_q=None,
    cumulative_seqlens_k=None,
    max_seqlen_q=None,
    max_seqlen_k=None,
    block_tables=None,
    implementation=None,
    **kwargs,
) -> torch.Tensor:
    """Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch, but if there is a block table it can be the full k.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch, but if there is a block table it can be the full v.
        cumulative_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cumulative_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
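
    Example (illustrative sketch only; the sizes are arbitrary and not tied to any model):
        Two sequences of lengths 3 and 5 packed into one batch give total_q = total_k = 8, so

            cumulative_seqlens_q = torch.tensor([0, 3, 8], dtype=torch.int32)
            cumulative_seqlens_k = torch.tensor([0, 3, 8], dtype=torch.int32)
            max_seqlen_q = max_seqlen_k = 5

        and the query tokens of sequence `i` occupy rows
        `cumulative_seqlens_q[i]:cumulative_seqlens_q[i + 1]` of the flattened `q`.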
    """
    k, v = cache.update(k, v, module.layer_idx, cumulative_seqlens_k=cumulative_seqlens_k, **kwargs)

    # Per-layer sliding window: (-1, -1) disables it, otherwise attend to `sliding_window` tokens to the left.
    sliding_window = (-1, -1) if not getattr(module, "sliding_window", False) else (module.sliding_window, 0)
    # An externally provided kernel (`implementation`) takes precedence over the module-level import.
    varlen_attn = implementation.flash_attn_varlen_func if implementation is not None else flash_attn_varlen_func
    # Only forward the auxiliary attention-sink tensor when it is actually provided.
    custom_kwargs = {"s_aux": kwargs.get("s_aux")} if kwargs.get("s_aux") is not None else {}
    attn_output = varlen_attn(
        q.transpose(1, 2).squeeze(0).contiguous(),
        k.transpose(1, 2).squeeze(0).contiguous(),
        v.transpose(1, 2).squeeze(0).contiguous(),
        cumulative_seqlens_q.to(torch.int32),
        cumulative_seqlens_k.to(torch.int32).clone(),
        max_seqlen_q,
        max_seqlen_k,
        softmax_scale=module.scaling,
        causal=True,  # the mask is aligned automatically when q is shorter than k
        window_size=sliding_window,  # (-1, -1) means an unrestricted context window
        **custom_kwargs,
    )
    if isinstance(attn_output, tuple):
        attn_output = attn_output[0]
    return attn_output, None  # attention weights are not computed by the flash kernel