
    rh                     N   d dl mZ d dlZddlmZmZ ddlmZ  ej                  e	      Z
 e       Z	 	 	 	 ddej                  j                  dej                  dej                  d	ej                  d
eej                     dedee   dee   dee   deej                  df   fdZy)    )OptionalN   )_flash_attention_forward!flash_attn_supports_top_left_mask)loggingmodulequerykeyvalueattention_maskdropoutscalingsliding_windowsoftcapreturnc	                 p   |	j                  dd      s|	j                  d      t        j                  d       |j                  d   }
t	        d |j                  D              rt        d      |j                  dd      }|j                  dd      }|j                  dd      }d }|j                  t        j                  k(  rt        j                         rt        j                         }nat        | j                  d	      r| j                  j                  }n4t        d
 | j!                         D              j"                  j                  }|	j%                  dd        t'        ||||f|
| j(                  ||||t*        || j                  j,                  t        | d      r| j.                  nd d
|	}|d fS )Noutput_attentionsF	head_maskz`flash_attention_2` does not support `output_attentions=True` or `head_mask`. Please set your attention to `eager` if you want any of these features.r   c              3   &   K   | ]	  }|d k(    yw)r   N ).0dims     |/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/integrations/flash_attention.py	<genexpr>z*flash_attention_forward.<locals>.<genexpr>#   s     
+3!8
+s   zTensor query has shape  with a zero dimension.
FlashAttention does not support inputs with dim=0.
Please check your input shapes or use SDPA instead.   _pre_quantization_dtypec              3   j   K   | ]+  }t        |t        j                  j                        s(| - y w)N)
isinstancetorchnnLinear)r   layers     r   r   z*flash_attention_forward.<locals>.<genexpr>;   s'     j%zRWY^YaYaYhYhGijs   )33	is_causal	layer_idx)
query_lengthr#   r   softmax_scaler   r   use_top_left_masktarget_dtypeattn_implementationr$   )getloggerwarning_onceshapeany
ValueError	transposedtyper   float32is_autocast_enabledget_autocast_gpu_dtypehasattrconfigr   nextmodulesweightpopr   r#   _use_top_left_mask_attn_implementationr$   )r   r	   r
   r   r   r   r   r   r   kwargsseq_lenr(   attn_outputs                r   flash_attention_forwardr@      s    zz%u-K1H1TW	
 kk!nG

+u{{
++B
 	
 OOAq!E
--1
COOAq!E L{{emm#$$& 779LV]]$=>!==@@Lj6>>3CjjqqwwL JJ{D!*	
 ""%,!"MM>>&-fk&B&"" K$     )g        NNN)typingr   r   modeling_flash_attention_utilsr   r   utilsr   
get_logger__name__r+   r;   r    ModuleTensorfloatinttupler@   r   rA   r   <module>rL      s      h  
		H	%68  #$(#DHHOOD<<D 
D <<	D
 U\\*D D e_D SMD e_D 5<<DrA   