
from typing import Optional, TypedDict

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...processing_utils import Unpack
from ...utils import logging
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteFlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`):
            Cumulative sequence lengths for the query state.
        cu_seq_lens_k (`torch.LongTensor`):
            Cumulative sequence lengths for the key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
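
    Example (an illustrative sketch, not part of the original docstring: two sequences of
    lengths 3 and 5 packed into one row of 8 tokens, with the cumulative-length
    `cu_seqlens` convention of flash attention, i.e. a leading zero):

    ```python
    >>> import torch

    >>> kwargs: GraniteFlashAttentionKwargs = {
    ...     "cu_seq_lens_q": torch.tensor([0, 3, 8], dtype=torch.long),
    ...     "cu_seq_lens_k": torch.tensor([0, 3, 8], dtype=torch.long),
    ...     "max_length_q": 5,
    ...     "max_length_k": 5,
    ...     "seq_idx": torch.tensor([[0, 0, 0, 1, 1, 1, 1, 1]], dtype=torch.int32),
    ... }
    ```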
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   %   s7    " ######__r#   r   F)totalc                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
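
    Example (a minimal shape-check sketch; the tiny config values are illustrative, not
    defaults):

    ```python
    >>> import torch

    >>> config = GraniteMoeSharedConfig(hidden_size=64, shared_intermediate_size=128)
    >>> mlp = GraniteMoeSharedMLP(config)
    >>> mlp(torch.randn(1, 4, 64)).shape
    torch.Size([1, 4, 64])
    ```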
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        # Gated MLP: a single up-projection produces both the gate and the value half.
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        # Split into (gate, value), activate the gate, and multiply elementwise.
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # The shared expert is only instantiated when a shared intermediate size is configured.
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
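
        Example (an illustrative sketch only; `config`, `hidden_states`, and
        `position_embeddings` are assumed to be prepared the way `GraniteMoeSharedModel`
        prepares them internally):

        ```python
        >>> layer = GraniteMoeSharedDecoderLayer(config, layer_idx=0)
        >>> outputs = layer(
        ...     hidden_states,  # (batch, seq_len, hidden_size)
        ...     position_embeddings=position_embeddings,  # (cos, sin) from the rotary embedding
        ...     output_router_logits=True,
        ... )
        >>> hidden_states, router_logits = outputs
        ```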
        )r8   rI   rJ   rK   rL   rM   rN   rP   r"   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moerG   )r6   r8   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                   r$   r?   z$GraniteMoeSharedDecoderLayer.forward]   s   L !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !=43K3K#KK !55mD+/+@+@+O(=??"-M-0NNM =43K3K#KK ")++G''Gr#   )NNNFFNFN)r   r   r   r   r    r,   r   r@   r   r   r   booltupler	   r   FloatTensorr?   rA   rB   s   @r$   rD   rD   X   s,   h5 h# h 2637*.,1$)59/4KOO||O !.O u//0	O
 !O $D>O D>O !!1!12O 'tnO &eELL%,,,F&GHO 45O 
u  (51B1BEDUDU1U+V"WW	XOr#   rD   c                       e Zd ZU eed<   dgZy)GraniteMoeSharedPreTrainedModelr(   rD   N)r   r   r   r   r   _no_split_modulesr"   r#   r$   ra   ra      s    ""78r#   ra   c                   $     e Zd Zdef fdZ xZS )GraniteMoeSharedModelr(   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r+   r,   r   
ModuleListrangenum_hidden_layersrD   layersrH   s      r$   r,   zGraniteMoeSharedModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A)r   r   r   r   r,   rA   rB   s   @r$   rd   rd      s    
5 
 
r#   rd   c                   *     e Zd ZdgZdef fdZ xZS )GraniteMoeSharedForCausalLMzlm_head.weightr(   c                 d    t         |   |       t        |      | _        | j	                          y rf   )r+   r,   rd   model	post_initr5   s     r$   r,   z$GraniteMoeSharedForCausalLM.__init__   s&     *62
r#   )r   r   r   _tied_weights_keysr   r,   rA   rB   s   @r$   rl   rl      s    *+5  r#   rl   )rl   rd   ra   )typingr   r   r   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler'   rD   ra   rd   rl   __all__r"   r#   r$   <module>r|      s     '   !   &   C 
		H	%)5 2")) 4T#9 Tn9&? 9

O 
"7  fr#   