"""PyTorch MPT model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mpt import MptConfig


logger = logging.get_logger(__name__)


def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
    r"""
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
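
    Example (an illustrative sketch added for clarity, not from the MPT source; the
    head count and sequence length below are arbitrary):

    ```python
    >>> alibi = build_mpt_alibi_tensor(num_heads=4, sequence_length=8)
    >>> alibi.shape  # one (1, sequence_length) bias row per head
    torch.Size([4, 1, 8])
    ```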
    """
    alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
    num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))

    base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
    base = base * (alibi_bias_max / num_heads_power_of_2)

    slopes = 1.0 / torch.pow(2, base)
    slopes = slopes.view(1, num_heads_power_of_2, 1, 1)

    if num_heads_power_of_2 != num_heads:
        slopes = torch.concat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]

    alibi = alibi * slopes
    return alibi.squeeze(0)


class MptAttention(nn.Module):
    """Multi-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    """

    def __init__(self, config: MptConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.n_heads = config.n_heads
        self.max_seq_length = config.max_seq_len
        self.head_dim = self.hidden_size // self.n_heads
        self.softmax_scale = config.attn_config.softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)

        self.attn_dropout_p = config.attn_config.attn_pdrop
        self.clip_qkv = config.attn_config.clip_qkv
        self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        mixed_qkv = self.Wqkv(hidden_states)
        if self.clip_qkv:
            mixed_qkv = mixed_qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)

        query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
        query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale

        query_length = seq_length if past_key_value is None else seq_length + past_key_value.get_seq_length()

        if position_bias is not None:
            if len(position_bias.shape) != 3:
                raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
            key_length = key_states.shape[-2]

            position_bias_query_index = max(0, position_bias.size(1) - query_length)
            position_bias_key_index = max(0, position_bias.size(2) - key_length)

            position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]

            attention_scores = attention_scores + position_bias

        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)

        context_states = torch.matmul(attn_weights, value_states)
        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
        attn_output = self.out_proj(context_states)

        return attn_output, attn_weights


class MptMLP(nn.Module):
    def __init__(self, config: MptConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
        self.act = nn.GELU(approximate="none")
        self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
        self.hidden_dropout = config.attn_config.attn_pdrop

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        hidden_states = self.act(self.up_proj(hidden_states))

        intermediate_output = self.down_proj(hidden_states)

        output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
        output = output + residual

        return output


class MptBlock(GradientCheckpointingLayer):
    def __init__(self, config: MptConfig, layer_idx: Optional[int] = None):
        super().__init__()
        hidden_size = config.hidden_size

        self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_1.bias = None

        self.num_heads = config.n_heads
        self.attn = MptAttention(config, layer_idx)

        self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_2.bias = None

        self.ffn = MptMLP(config)

        self.dropout_rate = config.attn_config.attn_pdrop
        self.resid_attn_dropout = nn.Dropout(self.dropout_rate)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Cache] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.norm_1(hidden_states)

        residual = hidden_states

        # Self attention.
        attn_outputs, attn_weights = self.attn(
            layernorm_output,
            position_bias=position_bias,
            attention_mask=attention_mask,
            past_key_value=layer_past,
            cache_position=cache_position,
        )

        hidden_states = self.resid_attn_dropout(attn_outputs) + residual

        layernorm_output = self.norm_2(hidden_states)

        # Get residual
        residual = hidden_states

        # MLP.
        output = self.ffn(layernorm_output, residual)

        return output, attn_weights


@auto_docstring
class MptPreTrainedModel(PreTrainedModel):
    config: MptConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MptBlock"]
    _keys_to_ignore_on_load_missing = [r"lm_head.*."]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, LayerNorm):
            if module.bias is not None:
                module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _convert_to_mpt_cache(
        past_key_value: tuple[tuple[torch.Tensor, torch.Tensor]],
    ) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
        """
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        """
        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
        batch_size_times_num_heads = batch_size * num_heads
        # key:  [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
        # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
                layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )


@auto_docstring
class MptModel(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_heads

        # Embedding + LN Embedding
        self.wte = nn.Embedding(config.vocab_size, self.hidden_size)

        # Transformer blocks
        self.blocks = nn.ModuleList([MptBlock(config, layer_idx=i) for i in range(config.n_layers)])

        # Final Layer Norm
        self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_f.bias = None

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
        return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        hidden_states = inputs_embeds

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length_with_past = seq_length + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device)

        causal_mask = _prepare_4d_causal_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )
        causal_mask = causal_mask.bool()

        for block in self.blocks:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                position_bias=alibi,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        # Add last hidden state
        hidden_states = self.norm_f(hidden_states)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class MptForCausalLM(MptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = self.loss_function(
                lm_logits,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class MptForSequenceClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = MptModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForTokenClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = MptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForQuestionAnswering(MptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MptForCausalLM",
    "MptModel",
    "MptPreTrainedModel",
    "MptForSequenceClassification",
    "MptForTokenClassification",
    "MptForQuestionAnswering",
]