
    rh                     8   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z)d Z* G d dejV                        Z,e# G d de             Z- G d dejV                        Z. G d dejV                        Z/ G d dejV                        Z0 G d dejV                        Z1 G d dejV                        Z2 G d d ejV                        Z3 G d! d"ejV                        Z4 G d# d$e      Z5 G d% d&ejV                        Z6 G d' d(ejV                        Z7 G d) d*ejV                        Z8e# G d+ d,e-             Z9 G d- d.ejV                        Z:e# G d/ d0e-             Z; G d1 d2ejV                        Z< e#d34       G d5 d6e-             Z=e# G d7 d8e-             Z>e# G d9 d:e-             Z?e# G d; d<e-             Z@g d=ZAy)>zPyTorch ConvBERT model.    N)
attrgetter)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )ConvBertConfigc                    	 ddl }t        j
                  j                  |      }t        j                  d|        |j                  j                  |      }i }|D ]A  \  }}t        j                  d| d|        |j                  j                  ||      }	|	||<   C ddd	d
dddd}
|j                  dkD  rd}nd}t        |j                        D ]:  }d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d |
d| d!<   d| d"|
d| d#<   d| d$|
d| d%<   d| d&|
d| d'<   d| d(|
d| d)<   d| d*|
d| d+<   d| d,|
d| d-<   d| d.|
d| d/<   d| d0|
d| d1<   d| d2|
d| d3<   d| d4|
d| d5<   d| d6| d7|
d| d8<   d| d6| d9|
d| d:<   d| d;| d7|
d| d<<   d| d;| d9|
d| d=<   d| d>|
d| d?<   d| d@|
d| dA<   = | j                         D ]  }|d   }t        |      } ||       }|
|   }t!        j"                  ||         }t        j                  dB| dC| dD       |j%                  d7      r.|j%                  dE      s|j%                  dF      s|j&                  }|j%                  dG      r|j)                  ddHd      }|j%                  dI      r|j)                  dHdd      }|j%                  dJ      r|j+                  dK      }||_         | S # t        $ r t        j                  d        w xY w)Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   g_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variable
num_groupsrangenum_hidden_layersnamed_parametersr   torch
from_numpyendswithTpermute	unsqueezedata)modelconfigtf_checkpoint_pathtftf_path	init_varstf_datanameshapearrayparam_mappinggroup_dense_namejparam
param_name	retrieverresulttf_namevalues                      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbertrM   1   so    ggoo01G
KK8	BC''0IG  e(l5'BC&&w5 .R1Y3]'K%H%H#DM 1$"6++, Cw$QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'AB 	qc)CDE %QC'?@ 	qc)ABC %QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'UV 	qc)]^_ %QC'UV 	qc)]^_ %QC'IJ 	qc)QRS %QC'NO 	qc)QRS %QC'LM 	qc)OPQ %QC'MN 	qc)NOP %QC'KL 	qc)LMN %QC'EF 	qc)GHI %QC'HI 	qc)KLM %QC'CD 	qc)EFG %QC'GH 	qc)IJK %QC~6F5GwO 	qc)CDE %QC~6F5GuM 	qc)ABC %QCx0@/AI 	qc)=>? %QCx0@/AG 	qc);<= %QC'>? 	qc)ABC G]]^\__uDvqc)?@AGCwJ '') 1X
z*	5!
+  !12d7)6*Q78I&##$BC''(@A!GGE/0MM!Q*E/0MM!Q*E12OOB'E#$ Lk  Q	
 	s   L+ + Mc                        e Zd ZdZ fdZ	 	 	 	 d	deej                     deej                     deej                     deej                     dej                  f
dZ	 xZ
S )
ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_ids)r   r"   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr2   arangeexpandzerosrT   sizelongselfr:   	__class__s     rL   rZ   zConvBertEmbeddings.__init__   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&;&;AVAVWzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsrV   rT   inputs_embedsreturnc                 2   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	| j                  |      }
||	z   |
z   }| j                  |      }| j                  |      }|S )Nr"   r   rV   r   rX   device)rm   rT   hasattrrV   rk   r2   rl   rn   rx   r_   ra   rc   rd   rh   )rp   rs   rV   rT   rt   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedra   rc   
embeddingss               rL   forwardzConvBertEmbeddings.forward   s,     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M"66|D $ : :> J"%88;PP
^^J/
\\*-
rr   )NNNN)__name__
__module____qualname____doc__rZ   r   r2   
LongTensorFloatTensorr   __classcell__rq   s   @rL   rO   rO      s    Q
( 15593759$E,,-$ !!1!12$ u//0	$
   1 12$ 
		$rr   rO   c                   *    e Zd ZU eed<   eZdZdZd Z	y)ConvBertPreTrainedModelr:   convbertTc                 l   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yt        |t              r%|j                  j
                  j                          yt        |t               r`|j                  j
                  j                  d| j                  j                         |j                  j
                  j                          yy)zInitialize the weights        meanstdNg      ?)
isinstancer   LinearConv1dweightr8   normal_r:   initializer_rangebiaszero_r[   rQ   rd   fill_SeparableConv1DGroupedLinearLayer)rp   modules     rL   _init_weightsz%ConvBertPreTrainedModel._init_weights   sw   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)0KK""$ 23MM&&CT[[5R5R&SKK""$ 4rr   N)
r   r   r   r   __annotations__rM   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr    rr   rL   r   r      s    1O"&*#%rr   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )r   zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t         |           t        j                  |||||dz  d      | _        t        j                  ||dd      | _        t        j                  t        j                  |d            | _	        | j                  j                  j                  j                  d|j                         | j
                  j                  j                  j                  d|j                         y )Nr!   F)kernel_sizegroupspaddingr   r   )r   r   r   r   )rY   rZ   r   r   	depthwise	pointwise	Parameterr2   rl   r   r   r8   r   r   )rp   r:   input_filtersoutput_filtersr   kwargsrq   s         rL   rZ   zSeparableConv1D.__init__	  s    # 1$
 =.aV[\LL^Q!?@	""**9Q9Q*R""**9Q9Q*Rrr   hidden_statesru   c                 h    | j                  |      }| j                  |      }|| j                  z  }|S N)r   r   r   )rp   r   xs      rL   r   zSeparableConv1D.forward  s0    NN=)NN1	TYYrr   	r   r   r   r   rZ   r2   Tensorr   r   r   s   @rL   r   r     s'    ]S U\\ ell rr   r   c                        e Zd Z fdZ	 	 	 	 d	dej
                  deej                     deej                     deej
                     dee   de	ej
                  eej
                     f   fdZ
 xZS )
ConvBertSelfAttentionc                 j   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  |j                  z  }|dk  r|j                  | _        d| _        n|| _        |j                  | _        |j                  | _        |j                  | j                  z  dk7  rt        d      |j                  | j                  z  dz  | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        ||j                  | j                  | j                        | _        t        j                  | j                  | j                  | j                  z        | _        t        j                  |j                  | j                        | _        t        j&                  | j                  dgt)        | j                  dz
  dz        dg	      | _        t        j,                  |j.                        | _        y )
Nr   r]   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr!   )r   r   )rY   rZ   hidden_sizenum_attention_headsry   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r   querykeyrK   r   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldrf   attention_probs_dropout_probrh   )rp   r:   new_num_attention_headsrq   s      rL   rZ   zConvBertSelfAttention.__init__!  s>    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)"<"<@Q@Q"Q"Q&$88DO'(D$'>D$$//DO & 7 7 8 88A=UVV$*$6$6$:R:R$RWX#X !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
#2F&&(:(:D<Q<Q$
  "$4+=+=t?W?WZ^ZoZo?o!p ii(:(:D<N<NOii..2S$BWBWZ[B[_`A`=acd<e
 zz&"E"EFrr   r   attention_mask	head_maskencoder_hidden_statesoutput_attentionsru   c                 t   |j                   \  }}}|#| j                  |      }	| j                  |      }
n"| j                  |      }	| j                  |      }
| j                  |j	                  dd            }|j	                  dd      }| j                  |      }|j                  |d| j                  | j                        j	                  dd      }|	j                  |d| j                  | j                        j	                  dd      }|
j                  |d| j                  | j                        j	                  dd      }t        j                  ||      }| j                  |      }t        j                  |d| j                  dg      }t        j                  |d      }| j                  |      }t        j                  ||d| j                   g      }|j	                  dd      j#                         j%                  d      }t&        j(                  j+                  || j                  dgd| j                  dz
  dz  dgd      }|j	                  dd      j                  |d| j                   | j                        }t        j                  |d| j                  | j                  g      }t        j,                  ||      }t        j                  |d| j                   g      }t        j,                  ||j	                  dd            }|t/        j0                  | j                        z  }|||z   }t&        j(                  j                  |d      }| j3                  |      }|||z  }t        j,                  ||      }|j5                  dddd      j#                         }t        j                  ||d| j                  | j                  g      }t        j6                  ||gd      }|j9                         d d | j                  | j                  z  dz  fz   } |j                  | }|r||f}|S |f}|S )	Nr   r!   r"   dimr   )r   dilationr   strider   )rA   r   rK   r   	transposer   viewr   r   r2   multiplyr   reshaper   softmaxr   r   
contiguousr7   r   
functionalr   matmulmathsqrtrh   r6   catrm   )rp   r   r   r   r   r   
batch_sizer{   _mixed_key_layermixed_value_layermixed_key_conv_attn_layermixed_query_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                            rL   r   zConvBertSelfAttention.forwardH  s    %2$7$7!
J !,"hh'<=O $

+@ A"hh}5O $

= 9$($<$<]=T=TUVXY=Z$[!$=$G$G1$M! JJ}5',,D44d6N6N

)Aq/ 	 $((R9Q9QSWSkSklvvq
	 (,,D44d6N6N

)Aq/ 	  ..)BDUV 22?C!MM*;b$BWBWYZ=[\!MM*;C,,];~
BHZHZ7[\'11!Q7BBDNNrR--..2++a/A5q9 . 
 (11!Q7??D..0E0E
 ~D<T<TVZVkVk7lmn6GH~D<N<N7OP !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF==*b$BZBZ\`\t\t1uv		=(";Q? #0"4"4"6s";$$t'?'??!C?
 #
 +**,CD6G=/2 O\M]rr   NNNF)r   r   r   rZ   r2   r   r   r   booltupler   r   r   s   @rL   r   r      s    %GT 7;158<,1V||V !!2!23V E--.	V
  (5V $D>V 
u||Xell33	4Vrr   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrR   )rY   rZ   r   r   r   r   rd   re   rf   rg   rh   ro   s     rL   rZ   zConvBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rr   r   input_tensorru   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   rh   rd   rp   r   r   s      rL   r   zConvBertSelfOutput.forward  7    

=1]3}|'CDrr   r   r   r   rZ   r2   r   r   r   r   s   @rL   r   r     s1    >U\\  RWR^R^ rr   r   c                        e Zd Z fdZd Z	 	 	 	 d
dej                  deej                     deej                     deej                     dee	   de
ej                  eej                     f   fd	Z xZS )ConvBertAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rY   rZ   r   rp   r   outputsetpruned_headsro   s     rL   rZ   zConvBertAttention.__init__  s0    )&1	(0Err   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rp   r   r   r   r   r   r   rK   r   r   r   union)rp   headsindexs      rL   prune_headszConvBertAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rr   r   r   r   r   r   ru   c                 l    | j                  |||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )rp   r   )	rp   r   r   r   r   r   self_outputsattention_outputr   s	            rL   r   zConvBertAttention.forward  sQ     yy!
  ;;|AF#%QR(88rr   r   )r   r   r   rZ   r  r2   r   r   r   r   r   r   r   r   s   @rL   r   r     s    ";* 7;158<,1|| !!2!23 E--.	
  (5 $D> 
u||Xe&7&788	9rr   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )r   c                    t         |           || _        || _        || _        | j                  | j                  z  | _        | j                  | j                  z  | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        j                  t        j                  |            | _        y r   )rY   rZ   
input_sizeoutput_sizer.   group_in_dimgroup_out_dimr   r   r2   emptyr   r   )rp   r  r  r.   rq   s       rL   rZ   zGroupedLinearLayer.__init__  s    $&$ OOt>!--@ll5;;t@Q@QSWSeSe#fgLL[!9:	rr   r   ru   c                    t        |j                               d   }t        j                  |d| j                  | j
                  g      }|j                  ddd      }t        j                  || j                        }|j                  ddd      }t        j                  ||d| j                  g      }|| j                  z   }|S )Nr   r"   r   r!   )listrm   r2   r   r.   r  r6   r   r   r  r   )rp   r   r   r   s       rL   r   zGroupedLinearLayer.forward  s    -,,./2
MM-"doot?P?P)QRIIaALLDKK(IIaAMM!j"d.>.>?@		Mrr   r   r   s   @rL   r   r     s#    ;U\\ ell rr   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertIntermediatec                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y )Nr   r  r  r.   )rY   rZ   r.   r   r   r   intermediate_sizer   r   r   
hidden_actstrr   intermediate_act_fnro   s     rL   rZ   zConvBertIntermediate.__init__  s    !6#5#5v7O7OPDJ+!--6;S;S`f`q`qDJ f''-'-f.?.?'@D$'-'8'8D$rr   r   ru   c                 J    | j                  |      }| j                  |      }|S r   )r   r  rp   r   s     rL   r   zConvBertIntermediate.forward   s&    

=100?rr   r   r   s   @rL   r  r    s#    9U\\ ell rr   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertOutputc                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   r  rR   )rY   rZ   r.   r   r   r  r   r   r   rd   re   rf   rg   rh   ro   s     rL   rZ   zConvBertOutput.__init__  s    !6#;#;V=O=OPDJ+!33ASAS`f`q`qDJ f&8&8f>S>STzz&"<"<=rr   r   r   ru   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rL   r   zConvBertOutput.forward  r   rr   r   r   s   @rL   r  r    s1    	>U\\  RWR^R^ rr   r  c                       e Zd Z fdZ	 	 	 	 	 ddej
                  deej                     deej                     deej
                     deej
                     dee   de	ej
                  eej                     f   fd	Z
d
 Z xZS )ConvBertLayerc                 b   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is added)rY   rZ   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr  intermediater  r   ro   s     rL   rZ   zConvBertLayer.__init__  s    '-'E'E$*62 ++#)#=#= ##??4&(f ghh"3F";D08$V,rr   r   r   r   r   encoder_attention_maskr   ru   c                 >   | j                  ||||      }|d   }|dd  }	| j                  r?|=t        | d      st        d|  d      | j	                  |||||      }
|
d   }|	|
dd  z   }	t        | j                  | j                  | j                  |      }|f|	z   }	|	S )N)r   r   r   r)  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r%  r&  ry   AttributeErrorr)  r   feed_forward_chunkr#  r$  )rp   r   r   r   r   r+  r   self_attention_outputsr  r   cross_attention_outputslayer_outputs               rL   r   zConvBertLayer.forward(  s     "&/	 "0 "
 2!4(,??4@4!12$=dV DD D  '+&9&9 &%!'#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+rr   c                 L    | j                  |      }| j                  ||      }|S r   )r*  r   )rp   r  intermediate_outputr1  s       rL   r.  z ConvBertLayer.feed_forward_chunkP  s,    "//0@A{{#68HIrr   )NNNNF)r   r   r   rZ   r2   r   r   r   r   r   r   r.  r   r   s   @rL   r!  r!    s    -" 7;158<9=,1&||& !!2!23& E--.	&
  (5& !) 6& $D>& 
u||Xe&7&788	9&Prr   r!  c                        e Zd Z fdZ	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej
                     deej
                     dee   dee   d	ee   d
e	e
ef   fdZ xZS )ConvBertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rY   rZ   r:   r   
ModuleListr/   r0   r!  layergradient_checkpointing)rp   r:   r   rq   s      rL   rZ   zConvBertEncoder.__init__W  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#r   r   r   r   r+  r   output_hidden_statesreturn_dictru   c	           	         |rdnd }	|rdnd }
|r| j                   j                  rdnd }t        | j                        D ]T  \  }}|r|	|fz   }	|||   nd } |||||||      }|d   }|s,|
|d   fz   }
| j                   j                  sL||d   fz   }V |r|	|fz   }	|st	        d ||	|
|fD              S t        ||	|
|      S )Nr   r   r   r!   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     rL   	<genexpr>z*ConvBertEncoder.forward.<locals>.<genexpr>  s      = s   )last_hidden_stater   
attentionscross_attentions)r:   r'  	enumerater8  r   r   )rp   r   r   r   r   r+  r   r:  r;  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                   rL   r   zConvBertEncoder.forward]  s$    #7BD$5b4%64;;;Z;Zr`d(4 	VOA|#$58H$H!.7.CilO(%&!M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(%	V(   1]4D D '):<OQef  
 2++*1	
 	
rr   )NNNNFFT)r   r   r   rZ   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r5  r5  V  s    , 7;158<9=,1/4&*0
||0
 !!2!230
 E--.	0

  (50
 !) 60
 $D>0
 'tn0
 d^0
 
u88	90
rr   r5  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rY   rZ   r   r   r   r   r   r  r  r   transform_act_fnrd   re   ro   s     rL   rZ   z(ConvBertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrr   r   ru   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   rO  rd   r  s     rL   r   z'ConvBertPredictionHeadTransform.forward  s4    

=1--m<}5rr   r   r   s   @rL   rM  rM    s$    UU\\ ell rr   rM  c                        e Zd ZdZdef fdZ	 ddej                  deej                     dej                  fdZ
 xZS )	ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r:   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)rY   rZ   getattrrT  NotImplementedErrorr   Identitysummaryry   rW  rX  
num_labelsr   r   r   
activationfirst_dropoutrZ  rf   last_dropoutr[  )rp   r:   num_classesactivation_stringrq   s       rL   rZ   z ConvBertSequenceSummary.__init__  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rr   r   	cls_indexru   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        rU  Nr"   firstr   r   r   r   rf  .r   rW   )r"   rV  )rT  r   r2   	full_likerA   rn   r7   rk   r   rm   gathersqueezer]  rb  r_  ra  rc  )rp   r   rf  r   s       rL   r   zConvBertSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rr   r   )r   r   r   r   r   rZ   r2   r   r   r   r   r   r   s   @rL   rR  rR    sQ    2H~ H< Y])"..);CEDTDT;U)			)rr   rR  c                   6    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee   dee   dee   deeef   fd       Z xZS )ConvBertModelc                 "   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        | j                          y r   )rY   rZ   rO   r~   r]   r   r   r   embeddings_projectr5  encoderr:   	post_initro   s     rL   rZ   zConvBertModel.__init__  sl     ,V4  F$6$66&(ii0E0EvGYGY&ZD#&v.rr   c                 .    | j                   j                  S r   r~   r_   rp   s    rL   get_input_embeddingsz"ConvBertModel.get_input_embeddings  s    ...rr   c                 &    || j                   _        y r   rs  )rp   rK   s     rL   set_input_embeddingsz"ConvBertModel.set_input_embeddings  s    */'rr   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrp  r8  r%  r  )rp   heads_to_pruner8  r  s       rL   _prune_headszConvBertModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Crr   rs   r   rV   rT   r   rt   r   r:  r;  ru   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  |
|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j#                  || j                   j$                        }| j                  ||||      }t        | d      r| j'                  |      }| j)                  ||||||		      }|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   z5You have to specify either input_ids or inputs_embeds)rx   rV   rw   )rs   rT   rV   rt   ro  )r   r   r   r:  r;  )r:   r   r:  use_return_dictr   %warn_if_padding_and_no_attention_maskrm   rx   r2   onesry   r~   rV   rk   rl   rn   get_extended_attention_maskget_head_maskr0   ro  rp  )rp   rs   r   rV   rT   r   rt   r   r:  r;  rz   r   r{   rx   r|   r}   extended_attention_maskr   s                     rL   r   zConvBertModel.forward!  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZFCN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>S^"_&&y$++2O2OP	l>iv ( 
 4-. 33MBM2/!5# % 
 rr   )	NNNNNNNNN)r   r   r   rZ   ru  rw  r{  r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   rm  rm    s   
/0C  156:59371559,0/3&*<E,,-< !!2!23< !!1!12	<
 u//0< E--.<   1 12< $D>< 'tn< d^< 
u88	9< <rr   rm  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )NgelurR   )rY   rZ   r   ra  r   rd   r]   re   r   r   r   ro   s     rL   rZ   z%ConvBertGeneratorPredictions.__init__d  sV    (0f&;&;AVAVWYYv1163H3HI
rr   generator_hidden_statesru   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   ra  rd   )rp   r  r   s      rL   r   z$ConvBertGeneratorPredictions.forwardk  s3    

#:;6}5rr   )	r   r   r   r   rZ   r2   r   r   r   r   s   @rL   r  r  a  s+    KJu/@/@ UEVEV rr   r  c                   V    e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee   dee   dee   deeef   fd       Z xZS )ConvBertForMaskedLMzgenerator.lm_head.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y r   )rY   rZ   rm  r   r  generator_predictionsr   r   r]   r\   generator_lm_headrq  ro   s     rL   rZ   zConvBertForMaskedLM.__init__w  sR     %f-%A&%I"!#6+@+@&BSBS!Trr   c                     | j                   S r   r  rt  s    rL   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddings  s    %%%rr   c                     || _         y r   r  )rp   r_   s     rL   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddings  s
    !0rr   rs   r   rV   rT   r   rt   labelsr   r:  r;  ru   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|Pt        j                         } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r"   r   losslogitsr   rB  )r:   r}  r   r  r  r   r	   r   r\   r   r   rB  )rp   rs   r   rV   rT   r   rt   r  r   r:  r;  r  generator_sequence_outputprediction_scoresr  loss_fctr   s                    rL   r   zConvBertForMaskedLM.forward  s   ( &1%<k$++B]B]"&-- 
#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
rr   
NNNNNNNNNN)r   r   r   _tied_weights_keysrZ   r  r  r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r  s  s   45&1  156:59371559-1,0/3&*4
E,,-4
 !!2!234
 !!1!12	4

 u//04
 E--.4
   1 124
 ))*4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
rr   r  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 h   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        || _        y r   )rY   rZ   r   r   r   r   classifier_dropoutrg   rf   rh   r`  out_projr:   rp   r:   r  rq   s      rL   rZ   z#ConvBertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrr   r   ru   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )rh   r   r   r:   r  r  )rp   r   r   r   s       rL   r   z"ConvBertClassificationHead.forward  se    !Q'"LLOJJqM4;;))*1-LLOMM!rr   r   r   s   @rL   r  r    s&    7	U\\  rr   r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )!ConvBertForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y r   )	rY   rZ   r`  r:   rm  r   r  
classifierrq  ro   s     rL   rZ   z*ConvBertForSequenceClassification.__init__  sH      ++%f-4V< 	rr   rs   r   rV   rT   r   rt   r  r   r:  r;  ru   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rV   rT   r   rt   r   r:  r;  r   r   
regressionsingle_label_classificationmulti_label_classificationr"   r  )r:   r}  r   r  problem_typer`  rX   r2   rn   r   r
   rk  r	   r   r   r   r   rB  rp   rs   r   rV   rT   r   rt   r  r   r:  r;  r   sequence_outputr  r  r  r   s                    rL   r   z)ConvBertForSequenceClassification.forward  s   ( &1%<k$++B]B]--))%'/!5#   

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s     156:59371559-1,0/3&*D
E,,-D
 !!2!23D
 !!1!12	D

 u//0D
 E--.D
   1 12D
 ))*D
 $D>D
 'tnD
 d^D
 
u..	/D
 D
rr   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )ConvBertForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y )Nr   )rY   rZ   rm  r   rR  sequence_summaryr   r   r   r  rq  ro   s     rL   rZ   z"ConvBertForMultipleChoice.__init__2  sM     %f- 7 ?))F$6$6: 	rr   rs   r   rV   rT   r   rt   r  r   r:  r;  ru   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r"   r   r  r   r  )r:   r}  rA   r   rm   r   r  r  r	   r   r   rB  )rp   rs   r   rV   rT   r   rt   r  r   r:  r;  num_choicesr   r  pooled_outputr  reshaped_logitsr  r  r   s                       rL   r   z!ConvBertForMultipleChoice.forward<  s   Z &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 --))%'/!5#   

 "!*--o>/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r  0  s     156:59371559-1,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 E--.Y
   1 12Y
 ))*Y
 $D>Y
 'tnY
 d^Y
 
u//	0Y
 Y
rr   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )ConvBertForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )rY   rZ   r`  rm  r   r  rg   r   rf   rh   r   r   r  rq  r  s      rL   rZ   z'ConvBertForTokenClassification.__init__  s      ++%f-)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rr   rs   r   rV   rT   r   rt   r  r   r:  r;  ru   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r"   r   r  )r:   r}  r   rh   r  r	   r   r`  r   r   rB  r  s                    rL   r   z&ConvBertForTokenClassification.forward  s    $ &1%<k$++B]B]--))%'/!5#   

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s     156:59371559-1,0/3&*2
E,,-2
 !!2!232
 !!1!12	2

 u//02
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
rr   r  c                   d    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eef   fd       Z xZS )ConvBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
rY   rZ   r`  rm  r   r   r   r   
qa_outputsrq  ro   s     rL   rZ   z%ConvBertForQuestionAnswering.__init__  sS      ++%f-))F$6$68I8IJ 	rr   rs   r   rV   rT   r   rt   start_positionsend_positionsr   r:  r;  ru   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r"   r   )ignore_indexr!   )r  start_logits
end_logitsr   rB  )r:   r}  r   r  splitrk  r   r  rm   clampr	   r   r   rB  )rp   rs   r   rV   rT   r   rt   r  r  r   r:  r;  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rL   r   z$ConvBertForQuestionAnswering.forward  s    &1%<k$++B]B]--))%'/!5#   

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rr   )NNNNNNNNNNN)r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s$     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
rr   r  )	r  r  r  r  r  r!  rm  r   rM   )Br   r   r'   operatorr   typingr   r   r   r2   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_convbertr   
get_loggerr   r%   rM   ModulerO   r   r   r   r   r   r   r  r  r!  r5  rM  rR  rm  r  r  r  r  r  r  r  __all__r   rr   rL   <module>r     s;     	  , ,    A A 1 9  . l l 3 
		H	%yx9 9x %o % %8bii 4~BII ~B *		 *Z ,299 (RYY &:. :z7
bii 7
tbii $`bii `F X+ X Xv299 $ H
1 H
 H
V 0 P
(? P
P
f e
 7 e
 e
P B
%< B
 B
J J
#: J
 J
Z
rr   