
import math
from typing import Optional

import torch
from torch import nn

from ...cache_utils import Cache, StaticCache
from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
from ...utils import logging
from ..gemma.modeling_gemma import GemmaForCausalLM
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..mistral.modeling_mistral import MistralMLP
from .configuration_diffllama import DiffLlamaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"


class DiffLlamaMLP(MistralMLP):
    pass


def lambda_init_fn(layer_idx):
    # Depth-dependent initial value of the differential-attention lambda:
    # 0.2 at the first layer, approaching 0.8 in deep layers.
    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)


class DiffLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        # Differential attention: a learned scalar lambda, re-parameterized through four
        # vectors, scales the second attention map that gets subtracted from the first.
        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, target_len, _ = hidden_states.size()
        q_len = target_len

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Pair the heads: both halves of the attention maps attend over the same values,
        # which are stacked along the channel dimension.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, target_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights


class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Flash attention expects the layout [batch_size, seq_len, num_heads, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # If the inputs were silently upcast to float32 (e.g. by layer norms kept in fp32),
        # cast them back to the dtype flash attention expects.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = (
                    torch.get_autocast_dtype(device_type)
                    if hasattr(torch, "get_autocast_dtype")
                    else torch.get_autocast_gpu_dtype()
                )
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact "
                f"you have upcasted embedding or layer norm layers in float32. We will cast back the input in "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states,
            key_states,
            value_states1,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output2 = _flash_attention_forward(
            query_states,
            key_states,
            value_states2,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None


class DiffLlamaSdpaAttention(DiffLlamaAttention):
    """
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA dispatches poorly on CUDA with non-contiguous inputs and a custom attn_mask,
        # so make the inputs contiguous in that case.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # Use SDPA's built-in causal masking only when no explicit mask is provided.
        is_causal = causal_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None


DIFFLLAMA_ATTENTION_CLASSES = {
    "eager": DiffLlamaAttention,
    "flash_attention_2": DiffLlamaFlashAttention2,
    "sdpa": DiffLlamaSdpaAttention,
}


class DiffLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)


class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
    _supports_flex_attn = False
    _supports_attention_backend = False

    def _init_weights(self, module):
        LlamaPreTrainedModel._init_weights(self, module)
        if isinstance(module, DiffLlamaAttention):
            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)


class DiffLlamaModel(LlamaModel):
    pass


class DiffLlamaForCausalLM(GemmaForCausalLM):
    pass


class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
    pass


class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
    pass


class DiffLlamaForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "DiffLlamaPreTrainedModel",
    "DiffLlamaModel",
    "DiffLlamaForCausalLM",
    "DiffLlamaForSequenceClassification",
    "DiffLlamaForQuestionAnswering",
    "DiffLlamaForTokenClassification",
]
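

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library API): the core "differential
# attention" arithmetic that DiffLlamaAttention.forward implements, reproduced
# on toy tensors. The lambda re-parameterization and the (1 - lambda_init)
# RMSNorm rescaling follow the eager forward pass above; the tensor sizes,
# seed, and random values below are made up purely for this demo. The block is
# guarded so it only runs when this file is executed directly.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)

    bsz, num_heads, q_len, head_dim = 1, 4, 5, 8
    lambda_init = lambda_init_fn(layer_idx=0)  # 0.8 - 0.6 * exp(0) = 0.2

    # Two independent softmax attention maps. In the module they come from the
    # two halves of the query/key heads and share the same value tensor.
    scores1 = torch.randn(bsz, num_heads, q_len, q_len) / math.sqrt(head_dim)
    scores2 = torch.randn(bsz, num_heads, q_len, q_len) / math.sqrt(head_dim)
    attn1 = torch.softmax(scores1, dim=-1)
    attn2 = torch.softmax(scores2, dim=-1)

    # Lambda is re-parameterized through four learnable vectors, exactly as in
    # DiffLlamaAttention.__init__ / forward.
    lambda_q1, lambda_k1, lambda_q2, lambda_k2 = (torch.randn(head_dim) * 0.1 for _ in range(4))
    lambda_1 = torch.exp(torch.sum(lambda_q1 * lambda_k1, dim=-1))
    lambda_2 = torch.exp(torch.sum(lambda_q2 * lambda_k2, dim=-1))
    lambda_full = lambda_1 - lambda_2 + lambda_init

    # Differential map: the first attention output minus a learned multiple of
    # the second one, which cancels "common-mode" attention noise.
    value = torch.randn(bsz, num_heads, q_len, 2 * head_dim)
    out = attn1 @ value - lambda_full * (attn2 @ value)

    # Head-wise RMSNorm over the 2 * head_dim channels, then the fixed
    # (1 - lambda_init) rescaling, mirroring the module's `groupnorm` step.
    groupnorm = nn.RMSNorm(2 * head_dim, elementwise_affine=False)
    out = (1 - lambda_init) * groupnorm(out)
    print("differential attention output:", tuple(out.shape), "lambda_full =", float(lambda_full))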