
from typing import Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.utils.generic import check_model_inputs

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import (
    GenericForQuestionAnswering,
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import OutputRecorder
from .configuration_minimax import MiniMaxConfig


@use_kernel_forward_from_hub("RMSNorm")
class MiniMaxRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        MiniMaxRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class MiniMaxCache(DynamicCache):
    def __init__(self):
        super().__init__()
        self.linear_cache: list[torch.Tensor] = []

    def set_linear_cache(self, layer_idx, linear_cache):
        # fill any skipped layers with empty placeholders before storing this one
        for _ in range(len(self.linear_cache), layer_idx + 1):
            self.linear_cache.append([])
        self.linear_cache[layer_idx] = linear_cache

    def get_linear_cache(self, layer_idx: int):
        if layer_idx < len(self):
            return self.linear_cache[layer_idx]
        return None

    def __len__(self):
        return max(super().__len__(), len(self.linear_cache))

    def __getitem__(self, layer_idx: int):
        if layer_idx < len(self.linear_cache) and self.linear_cache[layer_idx] != []:
            return (self.linear_cache[layer_idx],)
        return super().__getitem__(layer_idx)

    def __iter__(self):
        for layer_idx in range(len(self)):
            yield self[layer_idx]

    def batch_repeat_interleave(self, repeats: int):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx].repeat_interleave(repeats, dim=0)
            else:
                self.layers[layer_idx].batch_repeat_interleave(repeats)

    def batch_select_indices(self, indices: torch.Tensor):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx][indices, ...]
            else:
                self.layers[layer_idx].batch_select_indices(indices)

    def crop(self, max_length: int):
        raise RuntimeError("MiniMaxCache does not support `crop` method")


class MiniMaxLightningAttention(nn.Module):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_attention_heads = config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = config.block_size

        self.act_fn = ACT2FN[config.hidden_act]
        self.norm = MiniMaxRMSNorm(self.head_dim * self.num_attention_heads)
        self.qkv_proj = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.output_gate = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim, bias=False)

        slope_rate = self.get_slope_rate()
        query_decay, key_decay, diagonal_decay = self.decay_factors(slope_rate)

        self.register_buffer("slope_rate", slope_rate)
        self.register_buffer("query_decay", query_decay)
        self.register_buffer("key_decay", key_decay)
        self.register_buffer("diagonal_decay", diagonal_decay)

    def get_slope_rate(self):
        # per-head decay rates, scaled down for deeper layers
        base = 1 / (2 ** (8 / self.num_attention_heads))
        exponent = torch.arange(self.num_attention_heads) + 1
        factor = 1 - self.layer_idx / (self.num_hidden_layers - 1 + 1e-5) + 1e-5

        rate = base**exponent
        rate = rate * factor
        rate = rate[:, None, None]

        return rate

    def decay_factors(self, slope_rate):
        block_size_range = torch.arange(self.block_size) + 1

        query_decay = torch.exp(-slope_rate * block_size_range[:, None])
        key_decay = torch.exp(-slope_rate * (self.block_size - block_size_range[:, None]))

        diagonal_decay = block_size_range[:, None] - block_size_range[None, :]
        diagonal_decay = diagonal_decay[None, None, :, :]
        diagonal_decay = diagonal_decay * slope_rate
        diagonal_decay = torch.where(diagonal_decay >= 0, -diagonal_decay, float("-inf"))
        diagonal_decay = torch.exp(diagonal_decay)

        return query_decay, key_decay, diagonal_decay

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        batch_size, seq_len, hidden_size = hidden_states.shape
        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        qkv_states = self.act_fn(self.qkv_proj(hidden_states))
        qkv_states = qkv_states.reshape(batch_size, seq_len, self.num_attention_heads, 3 * self.head_dim)

        query_states, key_states, value_states = torch.split(qkv_states, self.head_dim, dim=3)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # recurrent (key^T @ value) state carried across blocks / decoding steps
        attn_weights_inter = None
        if past_key_value is not None:
            attn_weights_inter = past_key_value.get_linear_cache(self.layer_idx)

        if attn_weights_inter is None:
            attn_weights_inter = torch.zeros(batch_size, self.num_attention_heads, self.head_dim, self.head_dim).to(
                value_states
            )

            # apply attention_mask
            if attention_mask is not None:
                attention_mask = attention_mask.to(dtype=torch.bool)  # ensure it is a boolean tensor
                value_states = value_states.masked_fill(~attention_mask.unsqueeze(1).unsqueeze(-1), 0)

            attn_output = []
            for i in range(num_blocks):
                start_idx = i * self.block_size
                end_idx = min(start_idx + self.block_size, seq_len)
                current_block_size = end_idx - start_idx
                current_query_states = query_states[:, :, start_idx:end_idx]
                current_key_states = key_states[:, :, start_idx:end_idx]
                current_value_states = value_states[:, :, start_idx:end_idx]

                current_query_decay = self.query_decay[:, :current_block_size]
                current_key_decay = self.key_decay[:, -current_block_size:]
                current_diagonal_decay = self.diagonal_decay[:, :, :current_block_size, :current_block_size]
                block_decay = torch.exp(-self.slope_rate * current_block_size)

                # intra-block attention
                attn_weights_intra = torch.matmul(current_query_states, current_key_states.transpose(-1, -2))
                attn_output_intra = torch.matmul(attn_weights_intra * current_diagonal_decay, current_value_states)

                # inter-block attention
                attn_output_inter = torch.matmul(current_query_states * current_query_decay, attn_weights_inter)

                # final attention
                current_attn_output = attn_output_inter + attn_output_intra
                attn_output.append(current_attn_output)

                # calculate attn_weights_inter for next block or cache
                next_attn_weights_inter = torch.matmul(
                    (current_key_states * current_key_decay).transpose(-1, -2), current_value_states
                )
                attn_weights_inter = attn_weights_inter * block_decay + next_attn_weights_inter

        else:
            # decoding path: update the recurrent state one token at a time
            ratio = torch.exp(-self.slope_rate)
            attn_output = []
            for i in range(seq_len):
                current_query_states = query_states[:, :, i : i + 1]
                current_key_states = key_states[:, :, i : i + 1]
                current_value_states = value_states[:, :, i : i + 1]

                current_attn_weights_inter = torch.matmul(current_key_states.transpose(-1, -2), current_value_states)
                attn_weights_inter = ratio * attn_weights_inter + current_attn_weights_inter
                current_attn_output = torch.matmul(current_query_states, attn_weights_inter)

                attn_output.append(current_attn_output)

        # concatenate attention outputs over all blocks
        attn_output = torch.cat(attn_output, dim=-2)

        # final output projection
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.num_attention_heads * self.head_dim)
        attn_output = self.norm(attn_output)
        attn_output = F.sigmoid(self.output_gate(hidden_states)) * attn_output
        attn_output = self.out_proj(attn_output)

        # update cache
        if past_key_value is not None:
            past_key_value.set_linear_cache(self.layer_idx, attn_weights_inter)

        return attn_output, attn_weights_inter


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


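# Rotary position embeddings are applied multiplicatively: for every head,
# q' = q * cos + rotate_half(q) * sin (and likewise for k), so `rotate_half`
# above supplies the 90-degree-rotated counterpart used by the helper below.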
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


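# Grouped-query attention keeps fewer key/value heads than query heads.
# `repeat_kv` below tiles each key/value head `n_rep` times so the eager
# attention path can match every query head with a key/value head.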
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class MiniMaxAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MiniMaxBlockSparseTop2MLP(nn.Module):
    def __init__(self, config: MiniMaxConfig):
        super().__init__()
        self.ffn_dim = config.intermediate_size
        self.hidden_dim = config.hidden_size

        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states


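# Each expert above is a gated MLP, w2(act_fn(w1(x)) * w3(x)). The sparse MoE
# block below scores every token with a linear router, keeps the `top_k`
# experts per token, renormalizes their probabilities, and sums the selected
# experts' outputs weighted by those probabilities.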
class MiniMaxSparseMoeBlock(nn.Module):
    """
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.intermediate_size
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok

        # gating
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

        self.experts = nn.ModuleList([MiniMaxBlockSparseTop2MLP(config) for _ in range(self.num_experts)])

        # Jitter parameters
        self.jitter_noise = config.router_jitter_noise

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            hidden_states *= torch.empty_like(hidden_states).uniform_(
                1.0 - self.jitter_noise, 1.0 + self.jitter_noise
            )
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be solicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        for expert_idx in expert_hitted:
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert, then weight the output by the routing probability
            # of the corresponding tokens.
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # `index_add_` only supports torch tensors for indexing, so we use the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class MiniMaxDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MiniMaxAttention(config, layer_idx)
        self.block_sparse_moe = MiniMaxSparseMoeBlock(config)
        self.input_layernorm = MiniMaxRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MiniMaxRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx]
        self.mlp_alpha_factor = config.mlp_alpha_factor
        self.mlp_beta_factor = config.mlp_beta_factor

        if self.layer_type == "linear_attention":
            self.self_attn = MiniMaxLightningAttention(config, layer_idx)
            self.attn_alpha_factor = config.linear_attn_alpha_factor
            self.attn_beta_factor = config.linear_attn_beta_factor
        else:
            self.self_attn = MiniMaxAttention(config, layer_idx)
            self.attn_alpha_factor = config.full_attn_alpha_factor
            self.attn_beta_factor = config.full_attn_beta_factor

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        hidden_states = self.input_layernorm(hidden_states)
        residual = hidden_states

        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = residual * self.attn_alpha_factor + hidden_states * self.attn_beta_factor

        hidden_states = self.post_attention_layernorm(hidden_states)
        residual = hidden_states
        hidden_states, _ = self.block_sparse_moe(hidden_states)
        hidden_states = residual * self.mlp_alpha_factor + hidden_states * self.mlp_beta_factor

        return hidden_states


@auto_docstring
class MiniMaxPreTrainedModel(PreTrainedModel):
    config: MiniMaxConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MiniMaxDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = False
    _supports_attention_backend = True
    _can_record_outputs = {
        "router_logits": OutputRecorder(MiniMaxSparseMoeBlock, index=1),
        "hidden_states": MiniMaxDecoderLayer,
        "attentions": MiniMaxAttention,
    }


class MiniMaxRotaryEmbedding(nn.Module):
    def __init__(self, config: MiniMaxConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force full precision
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


@auto_docstring
class MiniMaxModel(MiniMaxPreTrainedModel):
    def __init__(self, config: MiniMaxConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MiniMaxDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = MiniMaxRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = MiniMaxRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[MiniMaxCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = MiniMaxCache()
        elif use_cache and not isinstance(past_key_values, MiniMaxCache):
            raise ValueError(
                f"MiniMax uses cache of its own and is not compatible with `past_key_values` of type "
                f"{type(past_key_values)}."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            if decoder_layer.layer_type == "full_attention":
                input_attention_mask = causal_mask
            else:
                # lightning attention uses the original attention_mask, and only for the prefill step
                input_attention_mask = attention_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=input_attention_mask,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each expert
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each expert
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


@auto_docstring
class MiniMaxForCausalLM(MiniMaxPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = MiniMaxModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_router_logits=output_router_logits,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                # make sure the auxiliary loss resides on the same device as the main loss
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


class MiniMaxForSequenceClassification(GenericForSequenceClassification, MiniMaxPreTrainedModel):
    pass


class MiniMaxForTokenClassification(GenericForTokenClassification, MiniMaxPreTrainedModel):
    pass


class MiniMaxForQuestionAnswering(GenericForQuestionAnswering, MiniMaxPreTrainedModel):
    pass


__all__ = [
    "MiniMaxForCausalLM",
    "MiniMaxForQuestionAnswering",
    "MiniMaxModel",
    "MiniMaxPreTrainedModel",
    "MiniMaxForSequenceClassification",
    "MiniMaxForTokenClassification",
]