
    rh@                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZ d	d
lmZ d	dlmZmZmZ d	dlmZmZmZmZmZmZmZ ddlmZ  ej@                  e!      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z*g dZ+y)     )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr   r   	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr$   z"GraniteMoeHybridAttention.__init__+   s    +    __name__
__module____qualname__r   intr$   __classcell__r'   s   @r(   r   r   *   s    ,5 ,# , ,r)   r   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr   r   c                 8    t         |   t        |      |       y r!   )r#   r$   r   r%   s      r(   r$   z#GraniteMoeHybridMambaLayer.__init__0   s    V,i8r)   r*   r0   s   @r(   r2   r2   /   s    95 9# 9 9r)   r2   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r!   r"   )r&   hidden_sizeepsr'   s      r(   r$   z%GraniteMoeHybridRMSNormGated.__init__5   s    c*r)   )gư>)r+   r,   r-   r$   r/   r0   s   @r(   r5   r5   4   s    + +r)   r5   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr   c                 $    t         |   |       y r!   r"   r&   r   r'   s     r(   r$   zGraniteMoeHybridMLP.__init__:   s     r)   )r+   r,   r-   r   r$   r/   r0   s   @r(   r:   r:   9   s    !5 ! !r)   r:   c                   \    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     dee	   dee
   dee
   d	eej                     d
ee
   deeej                  ej                  f      dee   deej                  eeej                  ej                  f      f   fdZ xZS )GraniteMoeHybridDecoderLayerr   r   c                    t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        t        |dd      dkD  | _        y )Nmambanum_local_expertsr   )r#   r$   r:   
shared_mlp	self_attnr@   layers_block_typer2   r   
layer_typegetattrhas_expertsr%   s      r(   r$   z%GraniteMoeHybridDecoderLayer.__init__?   s    +-f5
##I.'93FIFDJ6vyIDN 229= #6+>BQFr)   hidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	                    |}
| j                  |      }| j                   | j                  d||||d|	}d}n | j                  d|||||||d|	\  }}|
|| j                  z  z   }|}
| j	                  |      }| j
                  r)| j                  |      \  }}|| j                  |      z   }n| j                  |      }d}|
|| j                  z  z   }|f}|r||fz  }|r||fz  }|S )aB  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rH   rM   cache_paramsrI   )rH   rI   rJ   rK   rL   rM   rO    )input_layernormr@   rC   residual_multiplierpost_attention_layernormrG   block_sparse_moerB   )r&   rH   rI   rJ   rK   rL   rM   rN   rO   rP   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                  r(   forwardz$GraniteMoeHybridDecoderLayer.forwardO   sG   J !,,];::!&DJJ +-+-	
 M !%/=t~~ 	0+--"3#-$7	0 	0,M, !=43K3K#KK !55mD/3/D/D]/S,}-0NNM OOM:M M =43K3K#KK ")++G''Gr)   )NNFFNFN)r+   r,   r-   r   r.   r$   torchTensorr   r   bool
LongTensortupler
   r   FloatTensorr^   r/   r0   s   @r(   r>   r>   >   s   G5 G# G& 26*.,1$)59/4KOU||U !.U !	U
 $D>U D>U !!1!12U 'tnU &eELL%,,,F&GHU 45U 
u  (51B1BEDUDU1U+V"WW	XUr)   r>   c                   4     e Zd ZU eed<   dgZdZ fdZ xZS )GraniteMoeHybridPreTrainedModelr   r>   Tc                    t         |   |       t        |t              r|j                  j
                  j                  d       t        j                  t        j                  d|j                  dz               |j                  _        |j                  j
                  j                  d       y t        |t              r&|j                  j
                  j                  d       y y )Ng      ?r   )r#   _init_weights
isinstancer2   dt_biasdatafill_r_   logarange	num_headsA_logDr5   weight)r&   moduler'   s     r(   rh   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%f89NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <=MM$$S) >r)   )	r+   r,   r-   r   __annotations___no_split_modules_is_statefulrh   r/   r0   s   @r(   rf   rf      s!    ""78L* *r)   rf   c                   f    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	eeeej                     f      de	ej                     de	e   d	e	e   d
e	e   de	e   de	e   de	ej                     dee   deeef   fd              Zd Z xZS )GraniteMoeHybridModelr   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w r!   )r#   r$   r   
ModuleListrangenum_hidden_layersr>   layersr%   s      r(   r$   zGraniteMoeHybridModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A	input_idsrI   position_idspast_key_valuesinputs_embedsrL   rK   output_hidden_statesrN   return_dictrM   rP   rQ   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|t        j                  d       |F||j                         nd}t        j                  |||j                  d   z   |j                         }||j#                  d      }| j%                  |||||      }| j'                  ||      }|}d }| j(                  | j)                  ||      }|rdnd }|rdnd }|	rdnd }| j*                  D ]U  }|j,                  d	k(  r|n|}|r||fz  } ||f||||||	|d
|}|d   }|r|d   	||d   fz  }|	sG|d   M||d   fz  }W | j/                  |      }|r||fz  }|r|j0                  sd|_        t3        |||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicerT   r@   )rI   rJ   rK   rL   rM   rN   rO   T)last_hidden_stater   rH   
attentionsr\   )r   rK   r   rL   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthr_   rn   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr}   rE   normhas_previous_stater	   )r&   r~   rI   r   r   r   rL   rK   r   rN   r   rM   rP   past_seen_tokenscausal_mask
mamba_maskrH   rO   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r(   r^   zGraniteMoeHybridModel.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 ,,^^L
 &"??&"&//-"N #7BD0d"6BD![[ 	>M'4'?'?7'JP[J#!m%55!)
)."3#-%9$7
 
M *!,M  #/"}Q'7&99N# $0%-*;)==%;	>> 		-0  -!11?#E#E15O.%+++%+
 	
r)   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r_   all)r&   rI   rM   r   s       r(   r   z(GraniteMoeHybridModel._update_mamba_mask4  s7     $
!q ^%?EIIn`aNaDbJr)   )NNNNNNNNNNN)r+   r,   r-   r   r$   r   r   r_   rb   r   r`   r   r   listrd   ra   r
   r   rc   r   r^   r   r/   r0   s   @r(   rx   rx      sS   
5 
  '+1537KO59$(,0/3/3&*59s
##s
 !.s
 u//0	s

 "%tE4E4E/F(F"GHs
   1 12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!1!12s
 45s
 
u--	.s
  s
j	r)   rx   c                   >     e Zd ZdgZdef fdZ	 	 	 	 	 	 ddZ xZS )GraniteMoeHybridForCausalLMzlm_head.weightr   c                 d    t         |   |       t        |      | _        | j	                          y r!   )r#   r$   rx   model	post_initr<   s     r(   r$   z$GraniteMoeHybridForCausalLM.__init__C  s&     *62
r)   c                 N   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }ne|j                   d   |j                   d   k7  rF|d d |f   }n<|r:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  |||||d       |
S )Nr   r   r   r   r   r~   )r   r   rL   rI   rM   )
r   r   r   dtyper   longcumsummasked_fill_
contiguousupdate)r&   r~   r   rI   r   rM   r   rL   rP   empty_past_kvmodel_inputss              r(   prepare_inputs_for_generationz9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationI  sW    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 r)   )NNNNNT)r+   r,   r-   _tied_weights_keysr   r$   r   r/   r0   s   @r(   r   r   @  s2    *+5  7r)   r   )r   rx   rf   ),typingr   r   r_   r   cache_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr+   r   r   r2   r5   r:   r>   rf   rx   r   __all__rT   r)   r(   <module>r      s     #     O & > > 3 b b   C 
		H	%, 9 ,
9 9
+#4 +
!- !
f#? fR*&E *G1 GT@"= @F fr)   