
"""PyTorch MiniMax model."""

from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import MoeModelOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.generic import OutputRecorder
from ..mixtral.configuration_mixtral import MixtralConfig
from ..mixtral.modeling_mixtral import (
    MixtralAttention,
    MixtralDecoderLayer,
    MixtralForCausalLM,
    MixtralForQuestionAnswering,
    MixtralForSequenceClassification,
    MixtralForTokenClassification,
    MixtralModel,
    MixtralPreTrainedModel,
    MixtralRMSNorm,
    MixtralSparseMoeBlock,
)


logger = logging.get_logger(__name__)


class MiniMaxConfig(MixtralConfig):
    r"""
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequences of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per token; this can also be interpreted as the `top-k` routing
            parameter.
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.
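
    With these factors, each decoder sub-layer result is mixed back into the residual stream roughly as
    `residual * alpha_factor + sublayer_output * beta_factor` (a sketch of the weighting implied by the factors
    above); leaving every factor at its default of 1 recovers a plain residual connection.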

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
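
    >>> # A sketch of overriding the per-layer attention pattern explicitly
    >>> # (illustrative values using the two layer types this model defines)
    >>> custom_configuration = MiniMaxConfig(
    ...     num_hidden_layers=4,
    ...     layer_types=["linear_attention", "linear_attention", "linear_attention", "full_attention"],
    ... )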
    ```"""

    def __init__(
        self,
        layer_types=None,
        block_size=256,
        full_attn_alpha_factor=1,
        full_attn_beta_factor=1,
        linear_attn_alpha_factor=1,
        linear_attn_beta_factor=1,
        mlp_alpha_factor=1,
        mlp_beta_factor=1,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)

        self.layer_types = layer_types
        self.block_size = block_size
        self.full_attn_alpha_factor = full_attn_alpha_factor
        self.full_attn_beta_factor = full_attn_beta_factor
        self.linear_attn_alpha_factor = linear_attn_alpha_factor
        self.linear_attn_beta_factor = linear_attn_beta_factor
        self.mlp_alpha_factor = mlp_alpha_factor
        self.mlp_beta_factor = mlp_beta_factor

        if self.layer_types is None:
            self.layer_types = [
                "full_attention" if bool((i + 1) % 2) else "linear_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)


class MiniMaxRMSNorm(MixtralRMSNorm):
    pass


class MiniMaxCache(DynamicCache):
    def __init__(self):
        super().__init__()
        self.linear_cache = []

    def set_linear_cache(self, layer_idx, linear_cache):
        # pad with empty entries for any layers that have not been seen yet
        for _ in range(len(self.linear_cache), layer_idx + 1):
            self.linear_cache.append([])
        self.linear_cache[layer_idx] = linear_cache

    def get_linear_cache(self, layer_idx: int):
        if layer_idx < len(self):
            return self.linear_cache[layer_idx]
        return None

    def __len__(self):
        return max(super().__len__(), len(self.linear_cache))

    def __getitem__(self, layer_idx: int):
        if layer_idx < len(self.linear_cache) and self.linear_cache[layer_idx] != []:
            return (self.linear_cache[layer_idx],)
        return super().__getitem__(layer_idx)

    def __iter__(self):
        for layer_idx in range(len(self)):
            yield self[layer_idx]

    def batch_repeat_interleave(self, repeats: int):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx].repeat_interleave(repeats, dim=0)
            else:
                self.layers[layer_idx].batch_repeat_interleave(repeats)

    def batch_select_indices(self, indices: torch.Tensor):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx][indices, ...]
            else:
                self.layers[layer_idx].batch_select_indices(indices)

    def crop(self, max_length: int):
        raise RuntimeError("MiniMaxCache does not support `crop` method")


class MiniMaxLightningAttention(nn.Module):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_attention_heads = config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = config.block_size

        self.act_fn = ACT2FN[config.hidden_act]
        self.norm = MiniMaxRMSNorm(self.head_dim * self.num_attention_heads)
        self.qkv_proj = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.output_gate = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim, bias=False)

        slope_rate = self.get_slope_rate()
        query_decay, key_decay, diagonal_decay = self.decay_factors(slope_rate)

        self.register_buffer("slope_rate", slope_rate)
        self.register_buffer("query_decay", query_decay)
        self.register_buffer("key_decay", key_decay)
        self.register_buffer("diagonal_decay", diagonal_decay)

    def get_slope_rate(self):
        # per-head decay rates, scaled down for deeper layers
        base = 1 / (2 ** (8 / self.num_attention_heads))
        exponent = torch.arange(self.num_attention_heads) + 1
        factor = 1 - self.layer_idx / (self.num_hidden_layers - 1 + 1e-5) + 1e-5

        rate = base**exponent
        rate = rate * factor
        rate = rate[:, None, None]

        return rate

    def decay_factors(self, slope_rate):
        block_size_range = torch.arange(self.block_size) + 1

        query_decay = torch.exp(-slope_rate * block_size_range[:, None])
        key_decay = torch.exp(-slope_rate * (self.block_size - block_size_range[:, None]))

        diagonal_decay = block_size_range[:, None] - block_size_range[None, :]
        diagonal_decay = diagonal_decay[None, None, :, :]
        diagonal_decay = diagonal_decay * slope_rate
        diagonal_decay = torch.where(diagonal_decay >= 0, -diagonal_decay, float("-inf"))
        diagonal_decay = torch.exp(diagonal_decay)

        return query_decay, key_decay, diagonal_decay

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        batch_size, seq_len, hidden_size = hidden_states.shape
        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        qkv_states = self.act_fn(self.qkv_proj(hidden_states))
        qkv_states = qkv_states.reshape(batch_size, seq_len, self.num_attention_heads, 3 * self.head_dim)

        query_states, key_states, value_states = torch.split(qkv_states, self.head_dim, dim=3)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # running inter-block (recurrent) state, restored from the cache when available
        attn_weights_inter = None
        if past_key_value is not None:
            attn_weights_inter = past_key_value.get_linear_cache(self.layer_idx)

        if attn_weights_inter is None:
            attn_weights_inter = torch.zeros(batch_size, self.num_attention_heads, self.head_dim, self.head_dim).to(
                value_states
            )

            # apply attention_mask
            if attention_mask is not None:
                attention_mask = attention_mask.to(dtype=torch.bool)
                value_states = value_states.masked_fill(~attention_mask.unsqueeze(1).unsqueeze(-1), 0)

            attn_output = []
            for i in range(num_blocks):
                start_idx = i * self.block_size
                end_idx = min(start_idx + self.block_size, seq_len)
                current_block_size = end_idx - start_idx
                current_query_states = query_states[:, :, start_idx:end_idx]
                current_key_states = key_states[:, :, start_idx:end_idx]
                current_value_states = value_states[:, :, start_idx:end_idx]

                current_query_decay = self.query_decay[:, :current_block_size]
                current_key_decay = self.key_decay[:, -current_block_size:]
                current_diagonal_decay = self.diagonal_decay[:, :, :current_block_size, :current_block_size]
                block_decay = torch.exp(-self.slope_rate * current_block_size)

                # intra-block attention within the current block
                attn_weights_intra = torch.matmul(current_query_states, current_key_states.transpose(-1, -2))
                attn_output_intra = torch.matmul(attn_weights_intra * current_diagonal_decay, current_value_states)

                # inter-block attention against the accumulated state
                attn_output_inter = torch.matmul(current_query_states * current_query_decay, attn_weights_inter)

                # final attention output for this block
                current_attn_output = attn_output_inter + attn_output_intra
                attn_output.append(current_attn_output)

                # update the inter-block state for the next block (or for the cache)
                next_attn_weights_inter = torch.matmul(
                    (current_key_states * current_key_decay).transpose(-1, -2), current_value_states
                )
                attn_weights_inter = attn_weights_inter * block_decay + next_attn_weights_inter

        else:
            # decoding path: update the recurrent state one token at a time
            ratio = torch.exp(-self.slope_rate)
            attn_output = []
            for i in range(seq_len):
                current_query_states = query_states[:, :, i : i + 1]
                current_key_states = key_states[:, :, i : i + 1]
                current_value_states = value_states[:, :, i : i + 1]

                current_attn_weights_inter = torch.matmul(current_key_states.transpose(-1, -2), current_value_states)
                attn_weights_inter = ratio * attn_weights_inter + current_attn_weights_inter
                current_attn_output = torch.matmul(current_query_states, attn_weights_inter)

                attn_output.append(current_attn_output)

        # concatenate attention outputs over all blocks
        attn_output = torch.cat(attn_output, dim=-2)

        # final output projection
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.num_attention_heads * self.head_dim)
        attn_output = self.norm(attn_output)
        attn_output = F.sigmoid(self.output_gate(hidden_states)) * attn_output
        attn_output = self.out_proj(attn_output)

        # update cache
        if past_key_value is not None:
            past_key_value.set_linear_cache(self.layer_idx, attn_weights_inter)

        return attn_output, attn_weights_inter


class MiniMaxAttention(MixtralAttention):
    pass


class MiniMaxSparseMoeBlock(MixtralSparseMoeBlock):
    pass


class MiniMaxDecoderLayer(MixtralDecoderLayer, GradientCheckpointingLayer):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx]
        self.mlp_alpha_factor = config.mlp_alpha_factor
        self.mlp_beta_factor = config.mlp_beta_factor

        if self.layer_type == "linear_attention":
            self.self_attn = MiniMaxLightningAttention(config, layer_idx)
            self.attn_alpha_factor = config.linear_attn_alpha_factor
            self.attn_beta_factor = config.linear_attn_beta_factor
        else:
            self.self_attn = MiniMaxAttention(config, layer_idx)
            self.attn_alpha_factor = config.full_attn_alpha_factor
            self.attn_beta_factor = config.full_attn_beta_factor

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        hidden_states = self.input_layernorm(hidden_states)
        residual = hidden_states

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual * self.attn_alpha_factor + hidden_states * self.attn_beta_factor

        # Fully Connected
        hidden_states = self.post_attention_layernorm(hidden_states)
        residual = hidden_states
        hidden_states, _ = self.block_sparse_moe(hidden_states)
        hidden_states = residual * self.mlp_alpha_factor + hidden_states * self.mlp_beta_factor

        return hidden_states


class MiniMaxPreTrainedModel(MixtralPreTrainedModel):
    _can_compile_fullgraph = False
    _can_record_outputs = {
        "router_logits": OutputRecorder(MiniMaxSparseMoeBlock, index=1),
        "hidden_states": MiniMaxDecoderLayer,
        "attentions": [MiniMaxAttention, MiniMaxLightningAttention],
    }


class MiniMaxModel(MixtralModel):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[MiniMaxCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = MiniMaxCache()
        elif use_cache and not isinstance(past_key_values, MiniMaxCache):
            raise ValueError(
                f"MiniMax uses cache of its own and is not compatible with `past_key_values` of type {type(past_key_values)}."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            if decoder_layer.layer_type == "full_attention":
                input_attention_mask = causal_mask
            else:
                # lightning attention consumes the unexpanded attention mask directly
                input_attention_mask = attention_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=input_attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class MiniMaxForCausalLM(MixtralForCausalLM):
    def forward(self, **super_kwargs):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class MiniMaxForSequenceClassification(MixtralForSequenceClassification):
    pass


class MiniMaxForTokenClassification(MixtralForTokenClassification):
    pass


class MiniMaxForQuestionAnswering(MixtralForQuestionAnswering):
    pass


__all__ = [
    "MiniMaxConfig",
    "MiniMaxForCausalLM",
    "MiniMaxForQuestionAnswering",
    "MiniMaxModel",
    "MiniMaxPreTrainedModel",
    "MiniMaxForSequenceClassification",
    "MiniMaxForTokenClassification",
]