
"""PyTorch BERT model."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, get_torch_version, logging
from .configuration_bert import BertConfig


logger = logging.get_logger(__name__)

def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model
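
# Illustrative conversion call for the function above. This is a minimal usage
# sketch, assuming a local TensorFlow checkpoint; the file paths below are
# placeholders, not real files:
#
#   from transformers import BertConfig, BertForPreTraining
#
#   config = BertConfig.from_json_file("/path/to/bert_config.json")
#   model = BertForPreTraining(config)
#   load_tf_weights_in_bert(model, config, "/path/to/bert_model.ckpt")
#   model.save_pretrained("/path/to/pytorch_dump")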
deej                     deej                     deej                     deej                     de	dej                  fd	Z xZS )BertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       y )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r$   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrR   rt   register_bufferrZ   arangeexpandzerosrv   sizelongselfr^   	__class__s     r1   r}   zBertEmbeddings.__init__   s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsry   rv   inputs_embedspast_key_values_lengthreturnc                 Z   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }	|	}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j                  |      }| j                  |      }|S )Nrw   r$   ry   r   r{   deviceru   )r   rv   hasattrry   r   rZ   r   r   r   r   r   rt   r   r   r   )r   r   ry   rv   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r1   forwardzBertEmbeddings.forward   sH     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r   )NNNNr   )__name__
__module____qualname____doc__r}   r   rZ   
LongTensorFloatTensorrU   Tensorr   __classcell__r   s   @r1   ro   ro      s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
'r   ro   c                        e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     dee   dee	   deej
                     d	e
ej
                     fd
Z xZS )BertSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        || _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rt   ru   relative_keyrelative_key_queryr=   r$   )r|   r}   r   num_attention_headsr   rX   rU   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   rR   rt   r   r~   distance_embedding
is_decoder	layer_idxr   r^   rt   r   r   s       r1   r}   zBertSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"r   hidden_statesattention_mask	head_maskencoder_hidden_statespast_key_valueoutput_attentionscache_positionr   c                 	   |j                   \  }}	}
| j                  |      }|j                  |d| j                  | j                        j                  dd      }|d u}|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }|j                  |d| j                  | j                        j                  dd      }| j#                  |      }|j                  |d| j                  | j                        j                  dd      }|D|s|nd }j%                  ||| j                  d|i      \  }}|rd|j                  | j                  <   t'        j(                  ||j                  dd            }| j*                  dk(  s| j*                  dk(  r|j                   d   |j                   d   }}|Dt'        j,                  |dz
  t&        j.                  |j0                  	      j                  dd      }n@t'        j2                  |t&        j.                  |j0                  	      j                  dd      }t'        j2                  |t&        j.                  |j0                  	      j                  dd      }||z
  }| j5                  || j6                  z   dz
        }|j9                  |j:                  
      }| j*                  dk(  rt'        j<                  d||      }||z   }nE| j*                  dk(  r6t'        j<                  d||      }t'        j<                  d||      }||z   |z   }|t?        j@                  | j                        z  }|||z   }tB        jD                  jG                  |d      }| jI                  |      }|||z  }t'        j(                  ||      }|jK                  dddd      jM                         }|jO                         d d | jP                  fz   }|j                  |      }||fS )Nrw   r$   r=   r   Tr   r   r   rz   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r   ))rW   r   viewr   r   rV   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updaterZ   matmulrt   tensorr   r   r   r   r   tor{   einsummathsqrtr   
functionalsoftmaxr   permute
contiguousr   r   )r   r   r   r   r   r   r   r   
batch_sizer   _query_layeris_cross_attentionr   curr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  r1   r   zBertSelfAttention.forward   sN    %2$7$7!
Jjj/!&&z2t7O7OQUQiQijttq
 3$>%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  )7It)<)C)C{DNN=M~<^*&	; &@DN--dnn= !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L)!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r   NNNNNNFNr   r   r   r}   rZ   r   r   r   r   booltupler   r   r   s   @r1   r   r      s    #< 7;15=A*.,115d.||d. !!2!23d. E--.	d.
  ((9(9:d. !d. $D>d. !.d. 
u||	d.r   r   c                        e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej
                     deej                     deej                     dee   dee	   deej
                     d	e
ej
                     f fd
Z xZS )BertSdpaSelfAttentionc                     t         |   |||       |j                  | _        t	        j
                  t                     t	        j
                  d      k  | _        y )Nrt   r   z2.2.0)r|   r}   r   dropout_probr   parser"   require_contiguous_qkvr   s       r1   r}   zBertSdpaSelfAttention.__init__D  sK    9P\ef"??&-mm4E4G&H7==Y`Ka&a#r   r   r   r   r   r   r   r   r   c           	         | j                   dk7  s|s|*t        j                  d       t        |   |||||||      S |j                         \  }}	}
| j                  |      j                  |d| j                  | j                        j                  dd      }|d u}|r|n|}|St        |t              rA|j                  j                  | j                        }|r|j                   }n|j"                  }n|}|r|n|}|rK|IrGj$                  | j                     j&                  }|j$                  | j                     j(                  }n| j+                  |      j                  |d| j                  | j                        j                  dd      }| j-                  |      j                  |d| j                  | j                        j                  dd      }|D|s|nd }j/                  ||| j                  d|i      \  }}|rd|j                  | j                  <   | j0                  rK|j2                  j4                  dk(  r2|0|j7                         }|j7                         }|j7                         }| j8                  xr | xr |d u xr |	dkD  }t:        j<                  j>                  jA                  ||||| jB                  r| jD                  nd	|
      }|j                  dd      }|jG                  ||	| jH                        }|d fS )Nru   a  BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rw   r$   r=   r   Tcuda        )	attn_mask	dropout_p	is_causal)%rt   rC   warning_oncer|   r   r   r   r   r   r   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   typer   r   rZ   r   r   scaled_dot_product_attentiontrainingr   reshaper   )r   r   r   r   r   r   r   r   bsztgt_lenr   r   r   r   r   r   r   r   r  attn_outputr   s                       r1   r   zBertSdpaSelfAttention.forwardJ  s    '':59JiNcH 7?%!  (,,.Wa JJ}%**3D4L4LdNfNfgqqrsuvw 	 3$>2D.-%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<+224>>BGGI-44T^^DKKK (c2t779Q9QR1a  

>*c2t779Q9QR1a  )7It)<)C)C{DNN=M~<^*&	; &@DN--dnn=
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OOi,>(>i>UYCYi^ehi^i	hh))FF$+/==d''c G 
 "++Aq1!))#w8J8JKD  r   r   r   r   r   s   @r1   r   r   C  s    b 2615=A*.,115e!||e! !.e! E--.	e!
  ((9(9:e! !e! $D>e! !.e! 

class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


BERT_SELF_ATTENTION_CLASSES = {
    "eager": BertSelfAttention,
    "sdpa": BertSdpaSelfAttention,
}


class BertAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs

class BertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config, layer_idx=layer_idx)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention "
                    "layers by setting `config.add_cross_attention=True`"
                )

            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        return_legacy_cache = False
        if use_cache and self.config.is_decoder and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


@auto_docstring
class BertPreTrainedModel(PreTrainedModel):
    config: BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version, which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, BertLMPredictionHead):
            module.bias.data.zero_()

@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`BertForPreTraining`].
    """
)
class BertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
  
@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """
)
class BertModel(BertPreTrainedModel):
    _no_split_modules = ["BertEmbeddings", "BertLayer"]

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        # Expand the attention mask
        if use_sdpa_attention_masks and attention_mask.dim() == 2:
            # Expand the attention mask for SDPA.
            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            if self.config.is_decoder:
                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                    attention_mask,
                    input_shape,
                    embedding_output,
                    past_key_values_length,
                )
            else:
                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
        else:
            # We can provide a self-attention mask of dimensions
            # [batch_size, from_seq_length, to_seq_length]
            # ourselves in which case we just need to make it broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                # Expand the attention mask for SDPA.
                # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
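
# Illustrative usage of the bare encoder above, in the same style as the
# docstring examples elsewhere in this file (checkpoint name as used in those
# examples; any BERT checkpoint works the same way):
#
#   from transformers import AutoTokenizer, BertModel
#
#   tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#   model = BertModel.from_pretrained("google-bert/bert-base-uncased")
#   inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#   outputs = model(**inputs)
#   last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_size)
#   pooled = outputs.pooler_output                 # (1, hidden_size)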

@auto_docstring(
    custom_intro="""
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """
)
class BertForPreTraining(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **loss_kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


class BertForMaskedLM(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}
    @classmethod
    def can_generate(cls) -> bool:
        """
        Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
        `prepare_inputs_for_generation` method.
        """
        return False


@auto_docstring(
    custom_intro="""
    Bert Model with a `next sentence prediction (classification)` head on top.
    """
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
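

# --- Back-compat sketch (illustrative; not part of the original file) ---
# The legacy `next_sentence_label` keyword is still accepted by `forward`, but
# it emits a FutureWarning and is remapped to `labels`, so the two calls below
# are equivalent. Assumes the "google-bert/bert-base-uncased" checkpoint.
def _example_nsp_legacy_kwarg():  # pragma: no cover - illustrative only
    import torch
    from transformers import AutoTokenizer, BertForNextSentencePrediction

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
    encoding = tokenizer("First sentence.", "Second sentence.", return_tensors="pt")

    new_style = model(**encoding, labels=torch.LongTensor([0]))
    old_style = model(**encoding, next_sentence_label=torch.LongTensor([0]))  # warns
    return new_style.loss, old_style.loss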


@auto_docstring(
    custom_intro="""
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
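

# --- Usage sketch (illustrative; not part of the original file) ---
# `forward` infers `config.problem_type` from `num_labels` and the label dtype:
# num_labels == 1 selects the MSELoss regression branch, integer labels with
# num_labels > 1 select single-label cross-entropy, and float multi-hot labels
# select BCEWithLogitsLoss. Assumes the "google-bert/bert-base-uncased" checkpoint.
def _example_sequence_classification():  # pragma: no cover - illustrative only
    import torch
    from transformers import AutoTokenizer, BertForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

    inputs = tokenizer("A short example sentence.", return_tensors="pt")
    outputs = model(**inputs, labels=torch.tensor([1]))  # long dtype -> single-label branch
    return outputs.loss, outputs.logits  # logits shape: (1, 2)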
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForMultipleChoicec                 *   t         |   |       t        |      | _        |j                  |j                  n|j
                  }t        j                  |      | _        t        j                  |j                  d      | _        | j                          y )Nr$   )r|   r}   r  r  r  r   r   r   r   r   r   r<   r  r  s      r1   r}   zBertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	r   r   r   ry   rv   r   r   r  r   rY  rZ  r   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr$   rw   r   r  r=   r  )r^   r  rW   r   r   r  r   r<   r	   r   r   r_  )r   r   r   ry   rv   r   r   r  r   rY  rZ  num_choicesr.  rr  r  reshaped_logitsr  r  r"  s                      r1   r   zBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r!  r!    s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\\"$==	>X
 X
r   r!  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
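

# --- Usage sketch (illustrative; not part of the original file) ---
# Inputs arrive as (batch_size, num_choices, seq_len); `forward` flattens them
# to (batch_size * num_choices, seq_len) before the encoder, then folds the
# per-choice scores back into (batch_size, num_choices). Assumes the
# "google-bert/bert-base-uncased" checkpoint.
def _example_multiple_choice():  # pragma: no cover - illustrative only
    import torch
    from transformers import AutoTokenizer, BertForMultipleChoice

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = BertForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")

    prompt = "The baguette is a French bread."
    choices = ["It is eaten with a fork and a knife.", "It is eaten while held in the hand."]

    # Pair the prompt with each choice, then unsqueeze the num_choices dim.
    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # (1, 2, seq_len)
    outputs = model(**inputs, labels=torch.tensor([1]))
    return outputs.logits  # shape (1, 2): one score per choice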
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y NFr  )r|   r}   r  r  r  r  r   r   r   r   r   r   r<   r  r  s      r1   r}   z#BertForTokenClassification.__init__p  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r   r   r   ry   rv   r   r   r  r   rY  rZ  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rw   r=   r  )r^   r  r  r   r<   r	   r   r  r   r   r_  )r   r   r   ry   rv   r   r   r  r   rY  rZ  r.  r  r  r  r  r"  s                    r1   r   z"BertForTokenClassification.forward~  s    $ &1%<k$++B]B]))))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r'  r'  n  s     -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
uU\\"$99	:2
 2
r   r'  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
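

# --- Usage sketch (illustrative; not part of the original file) ---
# Token classification emits one label id per input token, including the
# special [CLS]/[SEP] tokens. `num_labels=5` is an arbitrary illustrative
# choice; assumes the "google-bert/bert-base-uncased" checkpoint.
def _example_token_classification():  # pragma: no cover - illustrative only
    import torch
    from transformers import AutoTokenizer, BertForTokenClassification

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = BertForTokenClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=5)

    inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits  # (1, seq_len, num_labels)
    return logits.argmax(dim=-1)  # one predicted label id per token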
ee   dee   dee   de	e
ej                     ef   fd       Z xZS )BertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r)  )
r|   r}   r  r  r  r   r   r   
qa_outputsr  r   s     r1   r}   z!BertForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r   r   r   ry   rv   r   r   start_positionsend_positionsr   rY  rZ  r   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r$   rw   r   )ignore_indexr=   )r  start_logits
end_logitsr   r_  )r^   r  r  r.  rN   r  r   rT   r   clampr	   r   r   r_  )r   r   r   ry   rv   r   r   r/  r0  r   rY  rZ  r.  r  r  r3  r4  r  ignored_indexr  
start_lossend_lossr"  s                          r1   r   z BertForQuestionAnswering.forward  s    &1%<k$++B]B]))))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r,  r,    s     -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
uU\\"$@@	A>
 >
r   r,  )r  r!  r
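

# --- Usage sketch (illustrative; not part of the original file) ---
# Extractive QA decodes an answer span from the argmax of the start and end
# logits. "deepset/bert-base-cased-squad2" is one example of a BERT checkpoint
# fine-tuned for QA; any compatible checkpoint works.
def _example_question_answering():  # pragma: no cover - illustrative only
    import torch
    from transformers import AutoTokenizer, BertForQuestionAnswering

    checkpoint = "deepset/bert-base-cased-squad2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = BertForQuestionAnswering.from_pretrained(checkpoint)

    question = "Where is HuggingFace based?"
    context = "HuggingFace Inc. is a company based in New York City."
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring start/end token positions delimit the answer span.
    start = outputs.start_logits.argmax()
    end = outputs.end_logits.argmax()
    answer_ids = inputs.input_ids[0, start : end + 1]
    return tokenizer.decode(answer_ids)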


__all__ = [
    "BertForMaskedLM",
    "BertForMultipleChoice",
    "BertForNextSentencePrediction",
    "BertForPreTraining",
    "BertForQuestionAnswering",
    "BertForSequenceClassification",
    "BertForTokenClassification",
    "BertLayer",
    "BertLMHeadModel",
    "BertModel",
    "BertPreTrainedModel",
    "load_tf_weights_in_bert",
]