
"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""

from typing import Optional, Union

import torch
from packaging import version

from ..utils import is_torch_flex_attn_available, logging
from ..utils.import_utils import _torch_version, is_torch_less_or_equal, is_torchdynamo_compiling


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
    from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention


logger = logging.get_logger(__name__)


class WrappedFlexAttention:
    """
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    """

    _instance = None
    _is_flex_compiled = False
    _compiled_flex_attention = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # Create a new instance only if one does not exist yet
            cls._instance = super().__new__(cls)
        return cls._instance

    @torch.compiler.disable(recursive=False)
    def __init__(self, training):
        """
        Initialize or update the singleton instance.
        """
        if not self._is_flex_compiled or training != self.training:
            self.training = training
            if is_torch_less_or_equal("2.5.1"):
                self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
            # PyTorch 2.6.0 during training: compile with the "max-autotune-no-cudagraphs" mode.
            elif version.parse(_torch_version).base_version == "2.6.0" and training:
                self._compiled_flex_attention = torch.compile(
                    flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                )
            else:
                self._compiled_flex_attention = torch.compile(flex_attention)
            self._is_flex_compiled = True

    def __call__(self):
        return self._compiled_flex_attention
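

# A minimal sketch of the singleton behaviour (hypothetical helper, for illustration only): constructing
# `WrappedFlexAttention` repeatedly returns the same instance, so `flex_attention` is only compiled once,
# and calling the instance retrieves the compiled kernel. This assumes flex attention is available in
# the installed PyTorch build.
def _example_wrapped_flex_attention_is_singleton() -> bool:
    first = WrappedFlexAttention(training=False)
    second = WrappedFlexAttention(training=False)
    # Both names point at the exact same object, and both return the same compiled callable.
    return first is second and first() is second()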


def compile_friendly_flex_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    training=False,
    **kwargs,
) -> torch.Tensor:
    # Use the singleton-compiled kernel unless we are already tracing with torchdynamo,
    # in which case the plain `flex_attention` is used directly.
    flex_attention_compiled = WrappedFlexAttention(training)() if not is_torchdynamo_compiling() else flex_attention
    return flex_attention_compiled(
        query,
        key,
        value,
        **kwargs,
    )


Offset = Union[torch.Tensor, int]


def make_flex_block_causal_mask(
    attention_mask_2d: torch.Tensor,
    attention_chunk_size: Optional[int] = None,
    query_length=None,
    key_length=None,
    offsets: Optional[tuple[Offset, Offset]] = None,
    is_causal: Optional[bool] = True,
) -> "BlockMask":
    """
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create the block (causal) logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = total_seq_len
    pad_len = (key_length // flex_default_block_size + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=[0, pad_len - key_length])
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    if attention_chunk_size is not None:
        # Assign each position a chunk index: [0, 0, ..., 0, 1, 1, ..., 1, 2, ...]
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # None: the mask is shared across all attention heads
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=True,
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    s_aux: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if head_mask is not None:
        logger.warning_once(
            "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
        )

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn off"
            " the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        if s_aux is not None:
            # Attention-sink handling: fold the auxiliary sink logits into the softmax normalization.
            logits_max = torch.max(score, dim=-1, keepdim=True).values
            sinks = torch.exp(s_aux - logits_max)
            unnormalized_scores = torch.exp(score - logits_max)
            normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
            score = unnormalized_scores / normalizer
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]
    # If the number of local query heads is not a power of two (e.g. under tensor parallelism),
    # repeat the KV heads explicitly instead of relying on `enable_gqa`.
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        # The log-sum-exp is requested so it can be returned as the attention weights.
        return_lse=True,
        training=module.training,
    )
    # The lse is returned in float32; cast it back to the value dtype.
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights