
    rh.                     :   d dl mZmZ d dlZd dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZmZ ddlmZ  ej6                  e      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d de      Z"g dZ#y)    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   4     e Zd ZdZddedee   f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 H    t         |   ||       |j                  | _        y N)super__init__attention_multiplierscalingselfr   r   	__class__s      ~/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/granite/modular_granite.pyr   zGraniteAttention.__init__+   s    +22    r   )	__name__
__module____qualname____doc__r   r   intr   __classcell__r#   s   @r$   r   r   (   s"    G3} 3# 3 3r%   r   c                   f    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )GraniteDecoderLayerr   r   c                 l    t         |   ||       |j                  | _        t        ||      | _        y )N)r   r   )r   r   residual_multiplierr   	self_attnr!   s      r$   r   zGraniteDecoderLayer.__init__1   s.    +#)#=#= )9Mr%   hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
| j                  |      } | j                  d||||||||d|	\  }}|
|| j                  z  z   }|}
| j                  |      }| j	                  |      }|
|| j                  z  z   }|f}|r||fz  }|S )a.  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r2   r3   r4   r5   r6   r7   r8   r9    )input_layernormr1   r0   post_attention_layernormmlp)r"   r2   r3   r4   r5   r6   r7   r8   r9   kwargsresidualself_attn_weightsoutputss                r$   forwardzGraniteDecoderLayer.forward6   s    D !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !=43K3K#KK !55mD/ =43K3K#KK ")++Gr%   )NNNFFNN)r&   r'   r(   r   r*   r   torchTensorr   
LongTensorr   booltupleFloatTensorrD   r+   r,   s   @r$   r.   r.   0   s    N} N N 2637*.,1$)59KO?||? !.? u//0	?
 !? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X?r%   r.   c                       e Zd Zy)GranitePreTrainedModelN)r&   r'   r(   r<   r%   r$   rL   rL   x   s    r%   rL   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )GraniteModelr   c           	          t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )	r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layersr.   layersr!   s      r$   r   zGraniteModel.__init__}   sR     $*$?$?!mmEJ6KcKcEde	 3e
es   A(	input_idsr3   r4   past_key_valuesinputs_embedsr7   r6   output_hidden_statesr8   r@   r:   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|
t               }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }t%        | j                   |||	||      }|}| j'                  ||      }|rdnd }|rdnd }| j(                  d | j                   j*                   D ],  }|r||fz  } ||f||||||	|d	|
}|d   }|s$||d   fz  }. | j-                  |      }|r||fz  }t/        ||r|nd ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )device)r   input_embedsr3   r8   rV   r4   r<   )r3   r4   r5   r6   r7   r8   r9   )last_hidden_staterV   r2   
attentions)r   r6   rX   r7   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrP   r   get_seq_lengthrE   arangeshaperZ   	unsqueezer	   
rotary_embrT   rS   normr
   )r"   rU   r3   r4   rV   rW   r7   r6   rX   r8   r@   past_seen_tokenscausal_maskr2   r9   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r$   rD   zGraniteModel.forward   sG    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oom\J #7BD0d![[)H4;;+H+HI 	6M#!m%55!)
*)."3#-$7
 
M *!,M =#3"55'	6* 		-0  -!11&+/8Od+%	
 	
r%   )	NNNNNNNNN)r&   r'   r(   r   r   r   rE   rG   rF   r   rJ   rH   r   r   r
   rD   r+   r,   s   @r$   rN   rN   |   s    
} 
 151537+/59$(,0/359_
E,,-_
 !._
 u//0	_

 "%_
   1 12_
 D>_
 $D>_
 'tn_
 !!1!12_
 +,_
 
!_
r%   rN   c                   \   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddeej
                     deej                     deej
                     deeee	ej                     f      deej                     deej
                     dee   d	ee   d
ee   deej
                     deeej                  f   dee   defdZy)GraniteForCausalLMNrU   r3   r4   rV   rW   labelsr7   r6   rX   r8   logits_to_keepr@   r:   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d       n|}| j                  |d d |d d f         }|| j                   j                  z  }d }|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )N)	rU   r3   r4   rV   rW   r7   r6   rX   r8   )logitsrr   
vocab_size)lossru   rV   r2   r]   r<   )r   r6   rX   modelr\   
isinstancer*   slicelm_headlogits_scalingloss_functionrv   r   rV   r2   r]   )r"   rU   r3   r4   rV   rW   rr   r7   r6   rX   r8   rs   r@   rC   r2   slice_indicesru   rw   s                     r$   rD   zGraniteForCausalLM.forward   s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%pVFt{{OeOepiopD%#33!//))
 	
r%   )NNNNNNNNNNr   )r&   r'   r(   r   rE   rG   rF   r   r   listrJ   rH   r*   r   r   r   rD   r<   r%   r$   rq   rq      s$    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 +,2
 
 2
r%   rq   )rq   rN   rL   )$typingr   r   rE   torch.utils.checkpointr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr&   ra   r   r.   rL   rN   rq   __all__r<   r%   r$   <module>r      s     #    . / O & 0  1 
		H	%3~ 3E+ EP	1 	g
: g
T3
) 3
l Kr%   