
    rhQ              (          d dl Z d dlZd dlZd dlmZmZ d dlZd dlmc m	Z
 d dlmZ ddlmZmZmZmZmZmZ  ej(                  e      Zdej.                  dej.                  dej.                  fd	Zd0d
Zd Zdej.                  deej.                  ej.                  ef   fdZdej.                  dej.                  dej.                  dej.                  def
dZd Zd Z d0deejB                     fdZ"dee#   fdZ$dZ%d Z&d Z' G d ded      Z(	 	 	 	 	 	 	 	 	 	 	 	 	 d1dej.                  d ej.                  d!ej.                  deej.                     ded"e)d#e*d$eej.                     d%ee*   d&ee   d'e)d(ee*   d)ee)   d*eejV                     d+eejV                     d,ee   d-ee   deejB                     d.ee#   f&d/Z,y)2    N)Optional	TypedDict)is_kernels_available   )is_flash_attn_2_availableis_flash_attn_3_availableis_flash_attn_greater_or_equal#is_flash_attn_greater_or_equal_2_10is_torch_npu_availableloggingtensorindicesreturnc                 h     | j                         j                  dg| j                  dd   }||   S )N   )
contiguousreshapeshape)r   r   reshapeds      ~/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/modeling_flash_attention_utils.py_index_first_axisr   %   s8    *v  "**2AQR0@AHG    c                    |||z   n|}|j                  dt        j                        }|j                  dt        j                        }t        j                  |j	                         d      j	                         }|j                         j                         }t        j                  t        j                  |dt        j                        d      }t        | |      ||||fS )a  
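

# Illustrative sketch (not part of the upstream API): `_index_first_axis` flattens the
# (batch, seqlen) leading dims and gathers rows by flat index. The tensor sizes and the
# shape noted in the comment are assumptions worked out by hand for this toy input.
def _example_index_first_axis():
    tensor = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)  # (batch=2, seqlen=4, hidden=3)
    indices = torch.tensor([0, 2, 5])  # flat positions into the 2 * 4 = 8 (batch, seqlen) slots
    gathered = _index_first_axis(tensor, indices)
    # gathered has shape (3, 3): rows 0 and 2 of sequence 0, and row 1 of sequence 1
    return gathered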


def _fa3_unpad_input(hidden_states, attention_mask, unused_mask=None):
    """
    FA3-compatible unpad_input function.
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of the selected (mask == 1) tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    """
    all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask
    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    return (
        _index_first_axis(hidden_states, indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
        used_seqlens_in_batch,
    )


def _fa3_pad_input(hidden_states, indices, batch, seqlen):
    """
    FA3-compatible pad_input function.
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    dim = hidden_states.shape[1:]
    output = torch.zeros((batch * seqlen), *dim, device=hidden_states.device, dtype=hidden_states.dtype)
    output[indices] = hidden_states
    return output.view(batch, seqlen, *dim)


def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]:
    """
    Retrieves indexing data required to repad unpadded (ragged) tensors.
    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
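

# Worked toy example (illustrative only, not upstream API): the indexing data produced for a
# small right-padded mask. The values in the comments are derived by hand under that assumption.
def _example_get_unpad_data():
    attention_mask = torch.tensor([[1, 1, 1, 1], [1, 0, 0, 0]])
    indices, cu_seqlens, max_seqlen_in_batch = _get_unpad_data(attention_mask)
    # indices             -> tensor([0, 1, 2, 3, 4]): flat positions of the 5 valid tokens
    # cu_seqlens          -> tensor([0, 4, 5], dtype=torch.int32): offsets of each sequence
    # max_seqlen_in_batch -> 4
    return indices, cu_seqlens, max_seqlen_in_batch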


def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    unpad_input_func,
):
    """
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.
    This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
    tensors for query, key, value tensors.
    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.
        unpad_input_func:
            The function to use for unpadding the input tensors.
    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`tuple[torch.Tensor]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

    # With static caches the key/value states may be longer than the mask: slice them so we do not
    # attend to (and silently corrupt scores with) uninitialized cache slots.
    if key_layer.shape[1] > (seq_len := attention_mask.shape[-1]):
        key_layer, value_layer = key_layer[:, :seq_len, :, :], value_layer[:, :seq_len, :, :]

    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    key_layer = _index_first_axis(key_layer, indices_k)
    value_layer = _index_first_axis(value_layer, indices_k)
    if query_length == kv_seq_len:
        query_layer = _index_first_axis(query_layer, indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        max_seqlen_in_batch_q = 1
        # Note: this arange incurs a device memcpy.
        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -query_length: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input_func(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )
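

# Shape-level sketch (illustrative only, not upstream API): what `_upad_input` returns for a toy
# padded batch when queries and keys share the same length. `_fa3_unpad_input` is passed as the
# unpad function purely for the example; the real caller resolves it via `_lazy_imports`.
def _example_upad_input():
    batch, seqlen, heads, head_dim = 2, 4, 2, 8
    q = torch.randn(batch, seqlen, heads, head_dim)
    k = torch.randn(batch, seqlen, heads, head_dim)
    v = torch.randn(batch, seqlen, heads, head_dim)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    q, k, v, indices_q, (cu_q, cu_k), (max_q, max_k) = _upad_input(q, k, v, mask, seqlen, _fa3_unpad_input)
    # q, k, v are now (5, heads, head_dim); cu_q == cu_k == tensor([0, 3, 5]); max_q == max_k == 3
    return q, k, v, indices_q, (cu_q, cu_k), (max_q, max_k)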


def _prepare_from_posids(query, key, value, position_ids):
    """
    This function returns necessary arguments to call `flash_attn_varlen_func`.
    All three query, key, value states will be flattened.
    Cumulative lengths of each example in the batch will be extracted from position_ids.
    NOTE: ideally cumulative lengths should be prepared at the data collator stage
    Arguments:
        query (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        position_ids (`torch.Tensor`):
            Position ids of shape (batch_size, sequence_length); a position id of 0 marks the start of a new packed sequence.
    Return:
        query (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`tuple[torch.Tensor]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    query = query.contiguous().view(-1, query.size(-2), query.size(-1))
    key = key.contiguous().view(-1, key.size(-2), key.size(-1))
    value = value.contiguous().view(-1, value.size(-2), value.size(-1))

    position_ids = position_ids.flatten()
    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

    cu_seq_lens = torch.cat(
        (
            indices_q[position_ids == 0],
            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
        )
    )
    max_length = cu_seq_lens.diff().max().item()

    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


def _prepare_flash_attention_from_position_ids(query, key, value, position_ids):
    warnings.warn(
        "prepare_fa2_from_position_ids is deprecated, use _prepare_from_posids",
        FutureWarning,
    )
    return _prepare_from_posids(query, key, value, position_ids)


def fa_peft_integration_check(q, k, v, target_dtype: Optional[torch.dtype] = None):
    # PEFT usually upcasts layer norms (and thus the activations) to fp32, which flash attention
    # does not support: cast the states back to the requested half-precision dtype if needed.
    if target_dtype and q.dtype == torch.float32:
        logger.warning_once(f"Casting fp32 inputs back to {target_dtype} for flash-attn compatibility.")
        q, k, v = q.to(target_dtype), k.to(target_dtype), v.to(target_dtype)
    return q, k, v


def _lazy_imports(impl: Optional[str]):
    # Resolves the flash attention entry points (dense and varlen kernels plus the pad/unpad helpers)
    # for the requested implementation, falling back to the `kernels` hub package when allowed.
    is_fa2 = is_flash_attn_2_available() or is_torch_npu_available()
    is_fa3 = is_flash_attn_3_available()

    if impl == "flash_attention_2" or (impl is None and is_fa2 and not is_fa3):
        try:
            from flash_attn import flash_attn_func, flash_attn_varlen_func
            from flash_attn.bert_padding import pad_input, unpad_input

            return flash_attn_func, flash_attn_varlen_func, pad_input, unpad_input, False
        except ImportError as e:
            if not globals().get("use_remote_fa2"):
                answer = (
                    input(
                        "Unable to import the official flash attention, do you want to try to use "
                        "`kernels-community/flash-attn` (trust remote code) Yes or No? "
                    )
                    .strip()
                    .lower()
                )
                globals()["use_remote_fa2"] = answer in {"1", "y", "yes"}
            if globals()["use_remote_fa2"]:
                if not is_kernels_available():
                    raise ImportError("You need to install kernels: `pip install kernels`")
                from kernels import get_kernel

                impl = get_kernel("kernels-community/flash-attn")
                pad_input, unpad_input = _fa3_pad_input, _fa3_unpad_input
                return (
                    getattr(impl, "flash_attn_func", None),
                    getattr(impl, "flash_attn_varlen_func"),
                    pad_input,
                    unpad_input,
                    False,
                )
            raise ImportError(
                "Failed to import flash attention 2, please install it or use another implementation."
            ) from e

    if impl == "flash_attention_3" or (impl is None and is_fa3):
        from flash_attn_interface import flash_attn_func, flash_attn_varlen_func

        pad_input, unpad_input = _fa3_pad_input, _fa3_unpad_input
        return flash_attn_func, flash_attn_varlen_func, pad_input, unpad_input, True

    # A pre-loaded kernel module was passed directly as `impl`.
    pad_input, unpad_input = _fa3_pad_input, _fa3_unpad_input
    return getattr(impl, "flash_attn_func", None), getattr(impl, "flash_attn_varlen_func"), pad_input, unpad_input, True


def is_flash_attn_available():
    return is_flash_attn_2_available() or is_flash_attn_3_available() or is_torch_npu_available()


def flash_attn_supports_top_left_mask():
    if is_flash_attn_3_available():
        return False
    if is_flash_attn_2_available():
        return not is_flash_attn_greater_or_equal_2_10()

    from .integrations.npu_flash_attention import is_npu_fa2_top_left_aligned_causal_mask

    return is_npu_fa2_top_left_aligned_causal_mask()


class FlashAttentionKwargs(TypedDict, total=False):
    # Optional pre-computed cumulative sequence lengths that callers may pass to skip recomputation.
    cumulative_seqlens_q: Optional[torch.LongTensor]
    cumulative_seqlens_k: Optional[torch.LongTensor]


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,
    target_dtype: Optional[torch.dtype] = None,
    implementation: Optional[str] = None,
    **kwargs,
):
    # Resolve the flash attention callables once and cache them at module level.
    if not all(k in globals() for k in ("_flash_fn", "_flash_varlen_fn", "_pad_fn", "_unpad_fn", "_is_fa3")):
        flash_fn, flash_varlen_fn, pad_fn, unpad_fn, is_fa3 = _lazy_imports(implementation)
        globals()["_flash_fn"] = flash_fn
        globals()["_flash_varlen_fn"] = flash_varlen_fn
        globals()["_pad_fn"] = pad_fn
        globals()["_unpad_fn"] = unpad_fn
        globals()["_is_fa3"] = is_fa3
        flash_supports_window = "window_size" in inspect.signature(flash_varlen_fn).parameters
        globals()["_flash_supports_window"] = flash_supports_window
    else:
        flash_fn = globals()["_flash_fn"]
        flash_varlen_fn = globals()["_flash_varlen_fn"]
        pad_fn = globals()["_pad_fn"]
        unpad_fn = globals()["_unpad_fn"]
        is_fa3 = globals()["_is_fa3"]
        flash_supports_window = globals()["_flash_supports_window"]

    # With a top-left aligned causal mask (older FA2 / RoCm), disable causal for single-token queries.
    causal = is_causal and not (use_top_left_mask and query_length == 1)

    use_sliding_windows = (
        flash_supports_window and sliding_window is not None and key_states.shape[1] > sliding_window
    )
    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

    if not is_fa3:
        flash_kwargs["dropout_p"] = dropout

    if is_flash_attn_greater_or_equal("2.4.1"):
        det = deterministic if deterministic is not None else os.getenv("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        flash_kwargs["deterministic"] = det

    if softcap is not None:
        flash_kwargs["softcap"] = softcap

    # Attention-sink auxiliary logits, forwarded when the kernel supports them.
    if "s_aux" in kwargs:
        flash_kwargs["s_aux"] = kwargs.get("s_aux")

    query_states, key_states, value_states = fa_peft_integration_check(
        query_states, key_states, value_states, target_dtype
    )

    # The varlen kernel is needed whenever the batch is packed: signalled either by position ids or by
    # pre-computed cumulative sequence lengths.
    use_packed_path = position_ids is not None or all(
        kwarg is not None for kwarg in (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k)
    )

    if attention_mask is not None:
        q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length, unpad_fn
        )
        if "mps" in str(q.device):
            cu_seqlens_k = cu_seqlens_k.clone()

        out_unpad = flash_varlen_fn(
            q,
            k,
            v,
            cu_seqlens_q=cu_seqlens_q.to(torch.int32),
            cu_seqlens_k=cu_seqlens_k.to(torch.int32),
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        if isinstance(out_unpad, tuple):
            out_unpad = out_unpad[0]
        out = pad_fn(out_unpad, indices_q, query_states.shape[0], query_length)
    elif use_packed_path:
        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            if position_ids is None:
                raise ValueError(
                    "Position ids should be passed if the attention mask is not passed and the cu_seq-lens are not passed."
                )
            q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = _prepare_from_posids(
                query_states, key_states, value_states, position_ids
            )
        else:
            q = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
            k = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
            v = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))
            cu_seqlens_q, cu_seqlens_k = cu_seq_lens_q, cu_seq_lens_k
            max_seqlen_q, max_seqlen_k = max_length_q, max_length_k

        if "mps" in str(q.device):
            cu_seqlens_k = cu_seqlens_k.clone()

        out = flash_varlen_fn(
            q,
            k,
            v,
            cu_seqlens_q=cu_seqlens_q.to(torch.int32),
            cu_seqlens_k=cu_seqlens_k.to(torch.int32),
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        if isinstance(out, tuple):
            out = out[0]
        out = out.view(query_states.shape[0], -1, out.size(-2), out.size(-1))
    else:
        out = flash_fn(
            query_states, key_states, value_states, softmax_scale=softmax_scale, causal=causal, **flash_kwargs
        )

    return out[0] if isinstance(out, tuple) else out