
    rhD                        d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	c m
Z d dlm	Z	 d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e$       rd dl*m+Z+ d dl,m-Z- d dl.m/Z/ ne0Z- e%jb                  e2      Z3 G d dejh                  jj                        Z6	 	 dTdeejn                     dee8   fdZ9 G d de-      Z: G d de	jv                        Z< G d de	jv                        Z= G d d e	jv                        Z>d! Z?dUd"Z@	 dVd#d$d%ejn                  d&ejn                  d'ejn                  d(eej                     d)eBe8e8f   d*e8d+e8d,eeC   d-eeBejn                  ejn                  f   eBejn                     f   fd.ZDej                  fd#d$d%ejn                  d/e:dejn                  de8d)eBe8e8f   d*e8d+e8d0ej                  d-eBejn                     fd1ZGd#d$d%ejn                  d&ejn                  d'ejn                  d(eej                     d)eBe8e8f   d*e8d+e8d-eBejn                     fd2ZHeGeDeHd3ZI G d4 d$e	jv                        ZJ G d5 d6e      ZKe# G d7 d8e!             ZL	 	 dTd9ejn                  d&ejn                  d(eejn                     d:eejn                     d-eBejn                  ejn                  ejn                  e8eejn                     eejn                     f   f
d;ZMd9ejn                  d<ejn                  d=e8d>e8d-ejn                  f
d?ZNe# G d@ dAeL             ZO G dB dCe	jv                        ZP e#dDE       G dF dGeL             ZQ e#dHE       G dI dJeL             ZR e#dKE       G dL dMeL             ZSe# G dN dOeL             ZT e#dPE       G dQ dReL             ZUg dSZVy)W    N)nullcontext)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringis_flash_attn_2_availablelogging)is_triton_available   )ModernBertConfig) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryc                   \    e Zd Ze	 	 ddeej                     dee   fd       Zed        Z	y)ApplyRotaryEmbUnpadN
cu_seqlens
max_seqlenc           
          |j                         }|j                  \  }}}}	|d d d df   j                  |d|	      }
t        |
||d||dd       | j	                  |||       || _        |S )N   r   FT)seqlen_offsetsr"   r#   interleavedinplace)
contiguousshapeviewr   save_for_backwardr#   )ctxqkvcossinr"   r#   	total_nnz_three_nheadsheaddimqks              /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.pyforwardzApplyRotaryEmbUnpad.forward>   s     nn.1ii+	67G BQBZ__YG4!!		
 	c3
3#
    c                     | j                   \  }}}|j                         }|j                  \  }}}}|d d d df   j                  |d|      }	t	        |	||d|| j
                  ddd	       |d d d d d d fS )Nr%   r&   r   FT)r'   r"   r#   r(   r)   	conjugate)saved_tensorsr*   r+   r,   r   r#   )
r.   dor0   r1   r"   r2   r3   r4   r5   dqks
             r7   backwardzApplyRotaryEmbUnpad.backward]   s    "00S*]]_.0hh+	67G BQBinnYG4!~~
	
 4tT455r9   NN)
__name__
__module____qualname__staticmethodr   torchTensorintr8   r?    r9   r7   r!   r!   =   sQ     .2$(
 U\\* SM < 6 6r9   r!   r"   r#   c                 4    t         j                  | ||||      S )a  
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r!   apply)r/   r0   r1   r"   r#   s        r7   apply_rotary_unpaddedrK   t   s    . $$S#sJ
KKr9   c                   "    e Zd ZdZ	 	 	 	 ddededee   deej                     deej                     f
 fdZ
	 ddej                  d	ej                  dee   d
eej                  eej                  ej                  f   f   fdZd
efdZ xZS )!ModernBertUnpaddedRotaryEmbeddingzP
    The rotary position embeddings applied directly to unpadded sequences.
    dimbaser#   devicedtypec                 t    t         |   |||d       || _        |||| j                  |||       yyyy)a  
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        F)rN   rO   rP   r(   NrP   rQ   )super__init__r#   _update_cos_sin_cache)selfrN   rO   r#   rP   rQ   	__class__s         r7   rU   z*ModernBertUnpaddedRotaryEmbedding.__init__   sS     	StFN$!f&8U=N&&z&&N >O&8!r9   r/   r"   returnc                     |(| j                  ||j                  |j                         t        || j                  | j
                  ||      }|S )z
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        rS   r"   r#   )rV   rP   rQ   rK   _cos_cached_sin_cached)rW   r/   r"   r#   s       r7   r8   z)ModernBertUnpaddedRotaryEmbedding.forward   sS     !&&z#**CII&V#!!
 
r9   c                 T    d| j                    d| j                   d| j                   S )Nzdim=z, base=z, scale_base=)rN   rO   
scale_baserW   s    r7   
extra_reprz,ModernBertUnpaddedRotaryEmbedding.extra_repr   s(    dhhZwtyykt>OPPr9   )g     @NNNN)rA   rB   rC   __doc__rG   floatr   rE   rP   rQ   rU   rF   r   tupler8   strra   __classcell__rX   s   @r7   rM   rM      s     $()-'+OO O SM	O
 &O $O. %)	\\ LL SM	
 
u||U5<<#=>>	?2QC Qr9   rM   c                        e Zd ZdZdef fdZ ej                  d      dej                  dej                  fd       Z
	 ddeej                     d	eej                     dej                  fd
Z xZS )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                 d   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxepsbias)rT   rU   rk   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdroprW   rk   rX   s     r7   rU   zModernBertEmbeddings.__init__   sw     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	r9   Tdynamic	input_idsrY   c                 `    | j                  | j                  | j                  |                  S rb   )r|   ry   ru   )rW   r   s     r7   compiled_embeddingsz(ModernBertEmbeddings.compiled_embeddings   s%    yy4#6#6y#ABCCr9   inputs_embedsc                     |"| j                  | j                  |            }|S | j                  j                  r| j	                  |      n.| j                  | j                  | j                  |                  }|S rb   )r|   ry   rk   reference_compiler   ru   )rW   r   r   hidden_statess       r7   r8   zModernBertEmbeddings.forward   su     $ IIdii&>?M  ;;00 ((3YYtyy)<)<Y)GHI 
 r9   r@   )rA   rB   rC   rc   r   rU   rE   compile
LongTensorrF   r   r   r8   rg   rh   s   @r7   rj   rj      s    9/ 9 U]]4 DU-=-= D%,, D !D ei!%"2"23KSTYT`T`Ka	r9   rj   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    rk   c                    t         |           || _        t        j                  |j
                  t        |j                        dz  |j                        | _	        t        |j                     | _        t        j                  |j                        | _        t        j                  |j                  |j
                  |j                        | _        y )Nr%   rp   )rT   rU   rk   r   Linearrs   rG   intermediate_sizemlp_biasWir   hidden_activationactrz   mlp_dropoutr|   Wor}   s     r7   rU   zModernBertMLP.__init__   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r9   r   rY   c                     | j                  |      j                  dd      \  }}| j                  | j                  | j	                  |      |z              S )Nr%   r&   rN   )r   chunkr   r|   r   )rW   r   inputgates       r7   r8   zModernBertMLP.forward   sI    ggm,221"2=twwtyy%4!7899r9   )
rA   rB   rC   rc   r   rU   rE   rF   r8   rg   rh   s   @r7   r   r      s2    `/ `:U\\ :ell :r9   r   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )ModernBertRotaryEmbeddingrk   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)rT   rU   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrk   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rW   rk   rP   r   rX   s       r7   rU   z"ModernBertRotaryEmbedding.__init__   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r9   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r&   r   mpscpuF)device_typeenabledr%   r   )rQ   )r   rd   expandr+   torP   r   r   rf   rE   autocast	transposecatr0   r   r1   rQ   )
rW   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr0   r1   s
             r7   r8   z!ModernBertRotaryEmbedding.forward  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.rb   )
rA   rB   rC   r   rU   rE   no_gradr   r8   rg   rh   s   @r7   r   r      s4    // /" U]]_<  <r9   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr&   r%   r   )r+   rE   r   )r   x1x2s      r7   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr0   r1   r   unsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embr     sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr9   moduleModernBertAttentionr/   attention_masksliding_window_maskr   local_attentionbsrN   output_attentionsrY   c	                    | j                  ||      \  }
}|j                  dd      j                  d      \  }}}t        |||
|      \  }}| j                  dz  }t        j                  ||j                  dd            |z  }|dk7  r|}||z   }t        j                  j                  |dt
        j                  	      j                  |j                        }t        j                  j                  || j                  | j                  
      }t        j                  ||      }|j                  dd      j!                         }|j#                  |d|      }|r||fS |fS )Nr   r
   r   r%   r         ࿩r&   r&   r&   rN   rQ   )ptraining)
rotary_embr   unbindr   head_dimrE   matmulr   
functionalsoftmaxfloat32r   rQ   dropoutattention_dropoutr   r*   r,   )r   r/   r   r   r   r   r   rN   r   _kwargsr0   r1   querykeyvaluescaleattn_weightsattn_outputs                     r7   eager_attention_forwardr   9  sK      < @HCa+22q29E3%eS#s;JE3OOT!E<<s}}Q':;eCL(",.0L ==((2U]](SVVW\WbWbcL==((9Q9Q\b\k\k(lL,,|U3K''1-88:K""2r3/K\**>r9   r   target_dtypec	                     ||||      }|j                   t        j                  t        j                  fv}
|
rb|j                   }|j	                  |      }t        |||| j                  r| j                  nd| j                  |      }|j	                  |      }n3t        |||| j                  r| j                  nd| j                  |      }|j                  ||      fS )Nr[           )r"   r#   	dropout_pdeterministicwindow_size)
rQ   rE   float16bfloat16r   r   r   r   deterministic_flash_attnr,   )r   r/   r   r"   r#   r   r   rN   r   r   convert_dtype
orig_dtypeattns                r7   flash_attention_forwardr   ^  s     SZJ
GCIIemmU^^%DDM YY
ff\"/!!28//f..s 99'
 wwz"/!!28//f..s 99'
 IIb#  r9   c                 v   | j                  ||      \  }	}
|j                  dd      j                  d      \  }}}t        |||	|
      \  }}|dk7  r|}t	        j
                  |||| j                  r| j                  nd|      j                  dd      j                         }|j                  |d	|      }|fS )
Nr   r
   r   r%   r   r   r   )r   	attn_maskr&   )
r   r   r   r   Fscaled_dot_product_attentionr   r   r*   r,   )r   r/   r   r   r   r   r   rN   r   r0   r1   r   r   r   r   s                  r7   sdpa_attention_forwardr     s       < @HCa+22q29E3%eS#s;JE3(", 	
&&28//f..s$	
 
1a	  ""2r3/K>r9   )flash_attention_2eagersdpac                   z     e Zd ZdZd	dedee   f fdZ	 d
dej                  dee
   dej                  fdZ xZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    rk   layer_idc                    t         |           || _        || _        |j                  |j
                  z  dk7  r&t        d|j                   d|j
                   d      |j                  | _        |j                  | _        |j
                  | _	        |j                  |j
                  z  | _
        | j                  | j                  z  | _        t        j                  |j                  d| j                  z  |j                        | _        ||j                   z  dk7  rT|j"                  dz  |j"                  dz  f| _        |j$                  |j$                  n|j&                  }|j"                  }nd| _        |j(                  }|j&                  }|j*                  d	k(  rt-        | j                  ||
      | _        n-t1        j2                  |      }||_        t7        |      | _        t        j                  |j                  |j                  |j                        | _        |j                  dkD  rt        j:                  |j                        nt        j<                         | _        tA               | _!        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   r   r%   r   r   )rN   r#   rO   )rk   r   )"rT   rU   rk   r   rs   num_attention_heads
ValueErrorr   r   	num_headsr   all_head_sizer   r   attention_biasWqkvglobal_attn_every_n_layersr   local_rope_thetaglobal_rope_thetar   _attn_implementationrM   r   copydeepcopy
rope_thetar   r   rz   Identityout_dropsetpruned_heads)rW   rk   r   r  r   config_copyrX   s         r7   rU   zModernBertAttention.__init__  s'     : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%33**f.H.HH!]]T^^;IIf00!d6H6H2HvOdOde	f7771<$*$:$:a$?AWAW[\A\#]D 4:4K4K4W00]c]u]uJ&,&<&<##+D &,&D&D#11J&&*==?MM.EJDO --/K%/K"7{KDO))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqEr9   r   r   rY   c           
         | j                  |      }|j                  d   }| j                  j                  dk(  r)|j	                  dd| j
                  | j                        }n)|j	                  |dd| j
                  | j                        }t        | j                  j                     | f|| j                  | j                  || j                  |d|}|d   }| j                  | j                  |            }|f|dd  z   S )Nr   r   r&   r
   )r/   r   r   r   rN   r   r   )r  r+   rk   r  r,   r  r   MODERNBERT_ATTENTION_FUNCTIONr   r   r  r  r   )rW   r   r   kwargsr/   r   attn_outputss          r7   r8   zModernBertAttention.forward  s     ii&  #;;++/BB((2q$..$--@C((2r1dnndmmDC4T[[5U5UV	
 00""/	
 	
 %Qdggm&<=,qr"222r9   rb   F)rA   rB   rC   rc   r   r   rG   rU   rE   rF   boolr8   rg   rh   s   @r7   r   r     sS    %"/ %"8C= %"T -23||3 $D>3
 
3r9   c                   f    e Zd Zddedee   f fdZ ej                  d      dej                  dej                  fd       Z
	 	 	 	 	 	 ddej                  d	eej                     d
eej                     deej                     deej                     dee   dee   dej                  fdZ xZS )ModernBertEncoderLayerrk   r   c                    t         |           || _        |dk(  rt        j                         | _        n;t        j                  |j                  |j                  |j                        | _        t        ||      | _        t        j                  |j                  |j                  |j                        | _        t        |      | _        y )Nr   rn   )rk   r   )rT   rU   rk   r   r  	attn_normrv   rs   rw   rx   r   r   mlp_normr   mlprW   rk   r   rX   s      r7   rU   zModernBertEncoderLayer.__init__  s    q=[[]DN\\&*<*<&//X^XhXhiDN'vI	V%7%7V__SYScScd (r9   Tr~   r   rY   c                 B    | j                  | j                  |            S rb   )r   r  rW   r   s     r7   compiled_mlpz#ModernBertEncoderLayer.compiled_mlp  s    xxm455r9   r   r   r   r"   r#   r   c           	      
   | j                  | j                  |      ||||||      }||d   z   }| j                  j                  r| j	                  |      n| j                  | j                  |            }	||	z   }|f|dd  z   S )Nr   r   r   r"   r#   r   r   r   )r   r  rk   r   r$  r   r  )
rW   r   r   r   r   r"   r#   r   r  
mlp_outputs
             r7   r8   zModernBertEncoderLayer.forward  s     yyNN=)) 3%!!/ ! 
 &Q7 {{,, m,$--67 	
 &
2,qr"222r9   rb   )NNNNNF)rA   rB   rC   r   r   rG   rU   rE   r   rF   r$  r   r  r8   rg   rh   s   @r7   r  r    s    	)/ 	)8C= 	) U]]4 6%,, 65<< 6 !6 266:37-1$(,13||3 !.3 &ell3	3
 u//03 U\\*3 SM3 $D>3 
3r9   r  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dej                  fdZ	 dd	ee   d
edef fdZd Z fdZ xZS )ModernBertPreTrainedModelrk   modelTrj   r  Fr   c                    | j                   j                  ddt        j                  dt        ffd}| j                   j
                  | j                   j
                  t        j                  d| j                   j                  z        z  | j                   j
                  | j                   j                  dz  d}t        |t              r ||j                  |d          y t        |t              r- ||j                  |d	           ||j                  |d
          y t        |t               r- ||j"                  |d	           ||j                  |d
          y t        |t$              r ||j&                  |d
          y t        |t(              r ||j*                  |d
          y t        |t,        t.        t0        t2        f      r ||j4                  |d          y t        |t        j6                        rW|j8                  j:                  j=                  d       |j>                  %|j>                  j:                  jA                          y y y )Nr
   r   stdc                    t         j                  j                  | j                  d| |z  |z         t	        | t         j
                        r7| j                  *t         j                  j                  | j                         y y y )Nr   )meanr,  ab)r   inittrunc_normal_weightr   r   rp   zeros_)r   r,  cutoff_factors     r7   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weight?  sq    GG!! .3&#% "  &")),;;*GGNN6;;/ + -r9   g       @r   )inout	embedding	final_outr9  r7  r8  r:  g      ?)!rk   initializer_cutoff_factorr   Modulerd   initializer_rangemathsqrtnum_hidden_layersrs   r   rj   ru   r   r   r   r   r  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierrv   r3  datafill_rp   zero_)rW   r   r6  stdsr5  s       @r7   _init_weightsz'ModernBertPreTrainedModel._init_weights:  s   == M	0		 	0 	0 ++//;;00499S4;;C`C`=`3aa6600$6	
 f23--tK/@A.		4:.		4;/ 34T$Z0		4;/ 89d5k2 56U43+0.	
 ))4+<=-MM$$S){{&  &&( ' .r9   attn_implementationis_init_checkrY   c                 ~    	 || j                         rdn|}t        |   ||      S # t        t        f$ r Y "w xY w)zR
        Checks and dispatches to hhe requested attention implementation.
        r   )rO  rP  )_flash_attn_2_can_dispatchr  ImportErrorrT   %_check_and_adjust_attn_implementation)rW   rO  rP  rX   s      r7   rT  z?ModernBertPreTrainedModel._check_and_adjust_attn_implementationn  s_    	 '.43R3R3T $(   w< 3= = 
 	
 K( 		s   * <<c                    | j                   j                  du ry t        | d      rTt        | j                        dkD  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                   j                  t               | j                   _        y y )	NFhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
rk   r   r   lenrV  loggerwarning_oncerP   r   r   r`   s    r7   _maybe_set_compilez,ModernBertPreTrainedModel._maybe_set_compile  s   ;;((E14)c$2D2D.E.I{{,,##9 -2DKK);;u${{,,##9 -2DKK);;u${{,,##9 -2DKK);;((0,?,ADKK) 1r9   c                     t        |   |i |}| j                  j                  dv r<| j                  j                  rt        j                  d       d| j                  _        |S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rT   resize_token_embeddingsrk   r   rX  rY  )rW   argsr  model_embedsrX   s       r7   r\  z1ModernBertPreTrainedModel.resize_token_embeddings  s[    w6GG;;((L8{{,,##y -2DKK)r9   r  )rA   rB   rC   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr   r<  rN  r   rf   r  rT  rZ  r\  rg   rh   s   @r7   r)  r)  0  s{    &*#/1IJN2)BII 2)j IN
#+C=
AE
	
.B>
 
r9   r)  inputslabelsc                    |j                  dt        j                        }t        j                  |j	                         d      j	                         }t        |j                         j                               }t        j                  j                  j                  t        j                  |dt        j                        d      }| j                         dk(  r| j	                         |   }n*| j                  ^}	}
}|	|
z  } | j                  |g| |   }||j	                         |   nd}||j	                         |   nd}||||||fS )	a  
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    r&   r   F)as_tupler   )r   r   r%   N)sumrE   int32nonzeroflattenrG   maxitemr   r   padcumsumrN   r+   r,   )rf  r   r   rg  seqlens_in_batchindicesmax_seqlen_in_batchr"   unpadded_inputsbatchseqlenrestr+   unpadded_position_idsunpadded_labelss                  r7   _unpad_modernbert_inputr{    s,   . &))b)DmmN224uEMMOG.22499;<$$((6FAUZU`U`)acijJzz|q ..*73%||v%&++e3d3G<?K?WL0027;]a393Efnn&w/4OGZ1DF[]lllr9   rs  rv  rw  c                 l   | j                         dk(  rHt        j                  ||z  | j                  | j                        }| ||<   |j                  ||      }|S | j                  ^}}t        j                  ||z  g|| j                  | j                  d}| ||<    |j
                  ||g| }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    r   )rQ   rP   )rN   rE   zerosrQ   rP   r,   r+   )rf  rs  rv  rw  outputpadded_inputs_rx  s           r7   _pad_modernbert_outputr    s    $ zz|qUV^6<<V wE62  <<DUV^]d]&,,v}}] w#E69D9r9   c            !           e Zd Zdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee   dee   dee   dee   dee   dee   deee	j                  df   ef   fd       Zde	j                  dede	j                  fdZ xZS )ModernBertModelrk   c           	         t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        j                  |j                  |j                  |j                        | _        d| _        | j#                          y c c}w )Nrn   F)rT   rU   rk   rj   
embeddingsr   
ModuleListranger@  r  layersrv   rs   rw   rx   
final_normgradient_checkpointing	post_initr!  s      r7   rU   zModernBertModel.__init__  s     .v6mmFKFLdLdFef(#FH5f
 ,,v'9'9vU[UeUef&+#	 gs   C c                 .    | j                   j                  S rb   r  ru   r`   s    r7   get_input_embeddingsz$ModernBertModel.get_input_embeddings  s    ---r9   c                 &    || j                   _        y rb   r  )rW   r   s     r7   set_input_embeddingsz$ModernBertModel.set_input_embeddings	  s    ).&r9   r   r   r   r   r   rs  r"   r#   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrY   .c           
        	
 ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|du |duz  rt	        d      |rdnd}|rdnd}| j                          || j                  ||       	)
'||j                  dd \  	
n|j                  dd \  	
||j                  n|j                  }|(t        j                  	
f|t        j                        }d}| j                   j                  dk(  rM||d}|0t        j                         5  t        ||	      ^}}}}ddd       nQt        ||	      ^}}}}n>|&t        j                  
|
      j!                  d      }| j#                  ||      \  }}| j%                  ||      }| j&                  D ]9  }|r||fz   } ||||||||      }|d   }|s"t)        |      dkD  s1||d   fz   }; |r||fz   }| j+                  |      }|r't-        |	
      }|t/        	
fd|D              }|st/        d |||fD              S t1        |||      S # 1 sw Y   xY w)  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nz:You must specify exactly one of input_ids or inputs_embedsrH   r%   rS   Fr   T)rf  r   )rP   r   )r   )r   r   r&  r   rf  rs  rv  rw  c              3   <   K   | ]  }t        |         yw)r  N)r  ).0hsr  rs  r  s     r7   	<genexpr>z*ModernBertModel.forward.<locals>.<genexpr>z  s(      * +"gZ`ghh*s   c              3   &   K   | ]	  }||  y wrb   rH   )r  vs     r7   r  z*ModernBertModel.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater   
attentions)rk   r   r  use_return_dictr  rZ  %warn_if_padding_and_no_attention_maskr+   rP   rE   onesr  r  r   r{  aranger   _update_attention_maskr  r  rW  r  r  re   r   )rW   r   r   r   r   r   rs  r"   r#   r  r  r   r  r  all_hidden_statesall_self_attentionsrP   repadr  r   encoder_layerlayer_outputss         `  ``           r7   r8   zModernBertModel.forward  s	   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ"6BD$5b4! 66y.Q'/(&3&9&9"1&=#
G&/oobq&9#
G%.%:!!@T@T!"ZZW(=fTYT^T^_N;;++/BB:#5*:L ( I`#,^JF	7J
Q 
 Ja,^JFM7J
Q #$||GFCMMaP262M2M2C 3N 3/N/ )=Y![[ 	PM#$58H$H!)-$7)%%"3M *!,M S%7!%;&9]1=M<O&O#	P"   1]4D D62$gZPWM !,$) */* %!
 m]4EGZ$[mmm++*
 	
i s   >I--I6c                     |r| j                   j                  dk(  r't        j                  d       d| j                   _        nF| j                   j                  dk7  r-t        j                  d| j                   j                   d       t	        || j
                        }t        j                  |j                  d         j                  d      }t        j                  ||j                  z
        }|| j                   j                  dz  k  j                  d      j                  d      j                  |j                        }|j                  |j!                         t        j"                  | j
                        j$                        }||fS )Nr   zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r   zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r%   r   )rk   r  rX  rY  r   rQ   rE   r  r+   r   absTr   r   rP   masked_filllogical_notfinfomin)rW   r   r   global_attention_maskrowsdistancewindow_maskr   s           r7   r  z&ModernBertModel._update_attention_mask  sS   {{//69##V 4;011W<##  $ @ @A B:: !;>4:: V ||177:;EEaH99TDFF]+ 4499DDQGQQRSTWWXfXmXmn 	 4??@W@W@Y[`[f[fgkgqgq[r[v[vw$&999r9   NNNNNNNNNNNNN)rA   rB   rC   r   rU   r  r  r   r   rE   r   rF   rG   r  r   re   r   r8   r  rg   rh   s   @r7   r  r    s   	/ 	./  15156:3704*.-1$($(!%,0/3&*x
E,,-x
 !.x
 &ell3	x

 u//0x
  -x
 %,,'x
 U\\*x
 SMx
 SMx
 #x
 $D>x
 'tnx
 d^x
 
uU\\3&'8	9x
 x
t:U\\ :VZ :_d_k_k :r9   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )rA  rk   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )Nrn   )rT   rU   rk   r   r   rs   classifier_biasrB  r   classifier_activationr   rv   rw   rx   ry   r}   s     r7   rU   z!ModernBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r9   r   rY   c                 `    | j                  | j                  | j                  |                  S rb   )ry   r   rB  r#  s     r7   r8   z ModernBertPredictionHead.forward  s#    yy$**]";<==r9   )	rA   rB   rC   r   rU   rE   rF   r8   rg   rh   s   @r7   rA  rA    s-    a/ a>U\\ >ell >r9   rA  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc            "       8    e Zd ZdgZdef fdZd Zdej                  fdZ	 e
j                  d      d	e
j                  d
e
j                  fd       Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j                      dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee   dee   dee   dee   dee   dee   d
eee
j                     ef   fd       Z xZS )rC  zdecoder.weightrk   c                 t   t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                  |j                        | _        | j                  j                  | _        | j                  j                  | _        | j                          y )Nr   )rT   rU   rk   r  r*  rA  headr   r   rs   rr   decoder_biasrD  sparse_predictionsparse_pred_ignore_indexr  r}   s     r7   rU   zModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r9   c                     | j                   S rb   rD  r`   s    r7   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings  s    ||r9   new_embeddingsc                     || _         y rb   r  )rW   r  s     r7   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings  s	    %r9   Tr~   r~  rY   c                 B    | j                  | j                  |            S rb   )rD  r  )rW   r~  s     r7   compiled_headz#ModernBertForMaskedLM.compiled_head  s    ||DIIf-..r9   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  c                 L   ||n| j                   j                  }| j                          | j                   j                  dk(  r|||	|
)|'||j                  dd \  }
}n|j                  dd \  }
}||j
                  n|j
                  }|(t        j                  |
|f|t        j                        }|4t        j                         5  t        ||||      \  }}}}	}}ddd       nt        ||||      \  }}}}	}}| j                  ||||||||	|
||||      }|d   }| j                  rK|I|j                  d      }|j                  |j                  d   d      }|| j                  k7  }||   }||   }| j                   j                  r| j!                  |      n| j#                  | j%                  |            }d}|* | j&                  ||fd	| j                   j(                  i|}| j                   j                  dk(  rN| j                   j*                  s|
t-               nt        j                         5  t/        |||
|
      }ddd       |s|f}||f|z   S |S t1        |||j2                  |j4                        S # 1 sw Y   xY w# 1 sw Y   HxY w)r  Nr   r%   rS   )rf  r   r   rg  r   r   r   r   r   rs  r"   r#   r  r  r   r  r  r   r&   rr   r  losslogitsr   r  )rk   r  rZ  r  r+   rP   rE   r  r  r   r{  r*  r  r,   r  r   r  rD  r  loss_functionrr   repad_logits_with_gradr   r  r   r   r  )rW   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  rP   outputsr  mask_tokensr  r  r~  s                          r7   r8   zModernBertForMaskedLM.forward  s   F &1%<k$++B]B]!;;++/BB:#5*:L%'/$0.;.A.A"1.E+
G.7oobq.A+
G-6-B))H\H\!)%*ZZW0Ef\a\f\f%gN ( [r#,^Zfou\X	7J
LRX 
 \s,^Zfou\XM7J
LRX **) 3%'!!!/!5#  
 $AJ!!f&8[[_F 1 6 6v||A K !D$A$AAK 1+ >K(F {{,, 01dii(9:; 	 %4%%ffbAWAWb[abD;;++/BB"&++"D"D\a\i\i\k r/vwV`ipqr YF)-)9TGf$EvE!//))	
 	
m ^r rs   JJJJ#NNNNNNNNNNNNNN)rA   rB   rC   _tied_weights_keysr   rU   r  r   r   r  rE   r   rF   r  r   r   r   rG   r  r   re   r   r8   rg   rh   s   @r7   rC  rC    s    ++/ &BII & U]]4 /ELL /U\\ / !/  15156:/304)-*.-1$($(!%,0/3&*m
E,,-m
 !.m
 &ell3	m

 u||,m
  -m
 &m
 %,,'m
 U\\*m
 SMm
 SMm
 #m
 $D>m
 'tnm
 d^m
" 
uU\\"N2	3#m
 m
r9   rC  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee
   dee
   dee
   dee   dee   dee   deeej                     ef   fd       Z xZS )rE  rk   c                 n   t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j!                          y rb   )rT   rU   
num_labelsrk   r  r*  rA  r  rE   r   rz   classifier_dropoutr|   r   rs   rI  r  r}   s     r7   rU   z,ModernBertForSequenceClassification.__init__I  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r9   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rY   c                 f   ||n| j                   j                  }| j                          | j                  ||||||||	|
||||      }|d   }| j                   j                  dk(  r
|dddf   }nQ| j                   j                  dk(  r8||j                  d      z  j                  d      |j                  dd	
      z  }| j                  |      }| j                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rd| j                   _
        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _
        nd| j                   _
        | j                   j                  dk(  rIt!               }| j                  dk(  r& ||j#                         |j#                               }n |||      }n| j                   j                  dk(  r=t%               } ||j'                  d| j                        |j'                  d            }n,| j                   j                  dk(  rt)               } |||      }|s|f}||f|z   S |S t+        |||j,                  |j.                        S )aB  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr  r   clsr.  r&   r   r   TrN   keepdim
regressionsingle_label_classificationmulti_label_classificationr  )rk   r  rZ  r*  classifier_poolingr   rj  r  r|   rI  problem_typer  rQ   rE   longrG   r	   squeezer   r,   r   r   r   r  )rW   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  r  r  pooled_outputr  r  loss_fctr~  s                          r7   r8   z+ModernBertForSequenceClassification.forwardV  s   N &1%<k$++B]B]!**) 3%'!!!/!5#  
 $AJ;;))U2 1!Q$ 7[[++v5!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./YF)-)9TGf$EvE'!//))	
 	
r9   r  )rA   rB   rC   r   rU   r   r   rE   r   rF   rG   r  r   re   r   r8   rg   rh   s   @r7   rE  rE  C  sk   /   15156:/304)-*.-1$($(!%,0/3&*e
E,,-e
 !.e
 &ell3	e

 u||,e
  -e
 &e
 %,,'e
 U\\*e
 SMe
 SMe
 #e
 $D>e
 'tne
 d^e
" 
uU\\"$<<	=#e
 e
r9   rE  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee
   dee
   dee
   dee   dee   dee   deeej                     ef   fd       Z xZS )rG  rk   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rb   rT   rU   r  r  r*  rA  r  rE   r   rz   r  r|   r   rs   rI  r  r}   s     r7   rU   z)ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r9   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rY   c                    ||n| j                   j                  }| j                          | j                  ||||||||	|
||||      }|d   }| j	                  |      }| j                  |      }| j                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr  r   r&   r   r  )rk   r  rZ  r*  r  r|   rI  r   r,   r  r   r   r  )rW   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  r  r  r  r  r~  s                        r7   r8   z(ModernBertForTokenClassification.forward  s#   H &1%<k$++B]B]!**) 3%'!!!/!5#  
 $AJ II&78 II&78!23')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r9   r  )rA   rB   rC   r   rU   r   r   rE   r   rF   rG   r  r   re   r   r8   rg   rh   s   @r7   rG  rG    sk   
/ 
  15156:/304)-*.-1$($(!%,0/3&*I
E,,-I
 !.I
 &ell3	I

 u||,I
  -I
 &I
 %,,'I
 U\\*I
 SMI
 SMI
 #I
 $D>I
 'tnI
 d^I
  
uU\\"$99	:!I
 I
r9   rG  c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee	   dee	   dee	   dee
   dee
   dee
   deeej                     ef   fd       Z xZS )rH  rk   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rb   r  r}   s     r7   rU   z'ModernBertForQuestionAnswering.__init__   sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr9   r   r   r   r   start_positionsend_positionsrs  r"   r#   r  r  r   r  r  rY   c                 T   ||n| j                   j                  }| j                          | j                  |||||||	|
||||      }|d   }| j	                  |      }| j                  |      }| j                  |      }|j                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}|| | j                  ||||fi |}|s||f|dd z   }||f|z   S |S t        ||||j                  |j                        S )r  N)r   r   r   rs  r"   r#   r  r  r   r  r  r   r   r&   r   )r  start_logits
end_logitsr   r  )rk   r  rZ  r*  r  r|   rI  splitr  r*   r  r   r   r  )rW   r   r   r   r   r  r  rs  r"   r#   r  r  r   r  r  r  r  r  r  r  r  r  r~  s                          r7   r8   z&ModernBertForQuestionAnswering.forward+  sh   F &1%<k$++B]B]!**) 3%!!!/!5#  
 $AJ II&78 II&78!23#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD"J/'!"+=F)-)9TGf$EvE+%!!//))
 	
r9   r  )rA   rB   rC   r   rU   r   r   rE   rF   rG   r  r   re   r   r8   rg   rh   s   @r7   rH  rH    sf   	/ 	  266:/32604*.-1$($(!%,0/3&*K
ELL)K
 !.K
 &ell3	K

 u||,K
 "%,,/K
  -K
 %,,'K
 U\\*K
 SMK
 SMK
 #K
 $D>K
 'tnK
 d^K
" 
uU\\"$@@	A#K
 K
r9   rH  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c            "           e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee
   dee
   dee
   dee   dee   dee   deeej                     ef   fd       Z xZS )rF  rk   c                 8   t         |   |       || _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  d      | _        | j                          y Nr   )rT   rU   rk   r  r*  rA  r  rE   r   rz   r  r|   r   rs   rI  r  r}   s     r7   rU   z$ModernBertForMultipleChoice.__init__  so     $V,
,V4	HH$$V%>%>?	))F$6$6: 	r9   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  rY   c                 H   ||n| j                   j                  }||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                          | j                  ||||||||	|
||||      }|d   }| j                   j                  dk(  r
|dddf   }nQ| j                   j                  dk(  r8||j                  d      z  j                  d	      |j                  dd
      z  }| j                  |      }| j                  |      }| j                  |      }|j                  d|      }d}|t        j                         } |||      }|s|f|dd z   }||f|z   S |S t        |||j                   |j"                        S )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr   r&   r  r   r  r.  r   Tr  r  )rk   r  r+   r,   sizerZ  r*  r  r   rj  r  r|   rI  r   r   r   r   r  )rW   r   r   r   r   r   rg  rs  r"   r#   r  r  r   r  r  r  num_choicesr  r  r  r  reshaped_logitsr  r  r~  s                            r7   r8   z#ModernBertForMultipleChoice.forward  sa   L &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 	!**) 3%'!!!/!5#  
 $AJ;;))U2 1!Q$ 7[[++v5!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/ ++b+6**,HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r9   r  )rA   rB   rC   r   rU   r   r   rE   r   rF   rG   r  r   re   r   r8   rg   rh   s   @r7   rF  rF  z  sk   
/ 
  15156:/304)-*.-1$($(!%,0/3&*_
E,,-_
 !._
 &ell3	_

 u||,_
  -_
 &_
 %,,'_
 U\\*_
 SM_
 SM_
 #_
 $D>_
 'tn_
 d^_
" 
uU\\"$==	>#_
 _
r9   rF  )r  r)  rC  rE  rG  rH  rF  r@   r  r  )Wr  r>  
contextlibr   typingr   r   rE   torch.nn.functionalr   r   r   torch.nnr   r   r	   activationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   r   utils.import_utilsr   configuration_modernbertr   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr   flash_attn.ops.triton.rotaryr   object
get_loggerrA   rX  autogradFunctionr!   rF   rG   rK   rM   r<  rj   r   r   r   r   r   re   r  r   r   rQ   r   r   r  r   r  r)  r{  r  r  rA  rC  rE  rG  rH  rF  __all__rH   r9   r7   <module>r     s  ,   " "     A A ! B 9  L - G G 5 6 P89O 
		H	%46%..11 46v *. $L &	L
 L42Q 2Qj299 <:BII :(<		 <D(H )."!"	" LL" 	"
 5++," 38_" 	" 
"  ~" 5u||+,eELL.AAB"\ !&(!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! ++(! 5<<(!V ! 	  LL  	 
 5++,  38_  	  
  5<< H 1$"! L3")) L3^+37 +3\ } } }F ,0%)	&mLL&mLL&m 5<<(&m U\\"	&m
 5<<u||S(5<<:PRZ[`[g[gRhhi&mRLL\\  	
 \\> j:/ j: j:Z	>ryy 	> 
H
5 H

H
V 
t
*C t

t
n 
W
'@ W

W
t X
%> X
 X
v 
m
"; m

m
`r9   