
    rh                        d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e&jR                  e*      Z+da,d Z-d Z.dIdZ/dIdZ0dIdZ1d Z2 G d dejf                  jh                        Z5 G d dejf                  jh                        Z6 G d d      Z7dJdZ8d Z9	 	 	 dKdZ: G d  d!e	jv                        Z< G d" d#e	jv                        Z= G d$ d%e	jv                        Z> G d& d'e	jv                        Z? G d( d)e	jv                        Z@ G d* d+e	jv                        ZA G d, d-e      ZB G d. d/e	jv                        ZC G d0 d1e	jv                        ZD G d2 d3e	jv                        ZE G d4 d5e	jv                        ZFe" G d6 d7e             ZGe" G d8 d9eG             ZHe" G d: d;eG             ZI G d< d=e	jv                        ZJ e"d>?       G d@ dAeG             ZKe" G dB dCeG             ZLe" G dD dEeG             ZMe" G dF dGeG             ZNg dHZOy)LzPyTorch MRA model.    N)Path)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_cuda_platformis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                      t        t              j                         j                  j                  j                  dz  dz  fd}  | g d      }t	        d|d      ay )Nkernelsmrac                 4    | D cg c]  }|z  	 c}S c c}w N )filesfile
src_folders     w/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mra/modeling_mra.pyappend_rootz&load_cuda_kernels.<locals>.append_root4   s    .34d
T!444s   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr
   mra_cuda_kernel)r)   	src_filesr'   s     @r(   load_cuda_kernelsr1   0   sQ    h'')0077>>JURJ5 WXI=)TBO    c                 N   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d      dk7  rt        d      | j                  d      dk7  rt        d      | j                  d	
      j                  j                  dd	      }|j                         }|j                         }|j                         }t        j                  ||||      \  }}|j                  dd	      dddddddf   }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr/   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r(   
sparse_maxrL   <   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::wP_an!oH'11"b9!Qa-H%%%r2   c                    t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d   |j                  d   k7  rt        d      | j                  \  }}||z  }t	        j
                  |j                  d      t        j                  |j                        }| j                  |||      } | |dddf   ||z  j                         ddf   } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r5   z$mask must be a 2-dimensional tensor.r6   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r<   r=   r>   shapetorcharangelongrP   reshape)maskrF   
block_size
batch_sizeseq_len	num_block	batch_idxs          r(   sparse_maskr\   X   s     499;1?@@
7<<>aBCCzz!}a((]^^**J:%IW\\!_EJJw~~VI<<
Iz:D	!T'"Wy%8$>$>$@!CDDKr2   c                 j   | j                         \  }}}|j                         \  }}}||z  dk7  rt        d      ||z  dk7  rt        d      | j                  |||z  ||      j                  dd      } |j                  |||z  ||      j                  dd      }t	        | j                               dk7  rt        d      t	        |j                               dk7  rt        d      t	        |j                               d	k7  rt        d
      | j                  d      dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | ||j                               S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r;   r8   r4   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r5   r6   r   r7   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r=   r>   rU   rA   r<   rB   rC   r/   mm_to_sparse)	dense_query	dense_keyrF   rW   rX   
query_sizer:   _key_sizes	            r(   r`   r`   o   s    #."2"2"4J
C ~~'AxJ!#opp*!kll%%j*
2JJX[\ffgikmnK!!*h*.DjRUV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''YNNr2   c                 B   |j                         \  }}}||z  dk7  rt        d      | j                  d      |k7  rt        d      | j                  d      |k7  rt        d      |j                  |||z  ||      j                  dd      }t	        | j                               d	k7  rt        d
      t	        |j                               d	k7  rt        d      t	        |j                               dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | |||      }|j                  dd      j                  |||z  |      }|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   r^   r5   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r;   r8   r4   ,sparse_query must be a 4-dimensional tensor.r_   r6   r7   z8The size of the third dimension of dense_key must be 32.)	r=   r>   rU   rA   r<   rB   rC   r/   sparse_dense_mm)	sparse_queryrF   rb   rG   rW   rX   re   r:   dense_qk_prods	            r(   rh   rh      s    !* 0J#*!kllz)lmmz)kll!!*h*.DjRUV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L'9VefM!++B3;;JZdHdfijMr2   c                 `    | |z  |z  t        j                  | |d      z   j                         S )Nfloorrounding_mode)rR   divrT   )rF   dim_1_blockdim_2_blocks      r(   transpose_indicesrr      s.    {"k1EIIg{bi4jjpprrr2   c                   >    e Zd Zed        Zed        Zedd       Zy)MraSampledDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r#   )r`   save_for_backwardrW   )ctxra   rb   rF   rW   rE   s         r(   forwardzMraSampledDenseMatMul.forward   s1    %k9gzRk9g>#r2   c                    | j                   \  }}}| j                  }|j                  d      |z  }|j                  d      |z  }t        |||      }t	        |j                  dd      |||      }	t	        ||||      }
|
|	d d fS Nr   r;   r8   )saved_tensorsrW   r=   rr   rh   rA   )rw   gradra   rb   rF   rW   rG   rH   	indices_Tgrad_key
grad_querys              r(   backwardzMraSampledDenseMatMul.backward   s    *-*;*;'Y^^
%**1-;!q)Z7%gN	"4>>"b#99kS`a$T7IO
8T4//r2   c                 2    t         j                  | |||      S r#   )rt   apply)ra   rb   rF   rW   s       r(   operator_callz#MraSampledDenseMatMul.operator_call   s    $**;	7JWWr2   Nr7   __name__
__module____qualname__staticmethodrx   r   r   r$   r2   r(   rt   rt      s>      0 0 X Xr2   rt   c                   <    e Zd Zed        Zed        Zed        Zy)MraSparseDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r#   )rh   rv   rG   )rw   ri   rF   rb   rG   rE   s         r(   rx   zMraSparseDenseMatMul.forward   s2    (w	?[lGY?-r2   c                     | j                   \  }}}| j                  }|j                  d      |j                  d      z  }t        |||      }t	        |j                  dd      |||      }t        |||      }	|	d |d fS rz   )r{   rG   r=   rr   rh   rA   r`   )
rw   r|   ri   rF   rb   rG   rH   r}   r~   r   s
             r(   r   zMraSparseDenseMatMul.backward   s    +.+<+<(gy--!q)\->->r-BB%gN	"<#9#9"b#A9dTab!$	7;
44//r2   c                 2    t         j                  | |||      S r#   )r   r   )ri   rF   rb   rG   s       r(   r   z"MraSparseDenseMatMul.operator_call   s    #)),O\\r2   Nr   r$   r2   r(   r   r      s>      0 0 ] ]r2   r   c                       e Zd Zed        Zy)MraReduceSumc                 B   | j                         \  }}}}t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                         \  }}}}|j                         \  }}| j                  d      j	                  ||z  |      } t        j                  |j                  d      t
        j                  |j                        }t        j                  ||d	      j                         |d d d f   |z  z   j	                  ||z        }	t        j                  ||z  |f| j                  | j                        }
|
j                  d|	|       j	                  |||      }|j	                  |||z        }|S )
Nr4   rg   r5   r6   r9   r   rN   rl   rm   )r=   r<   r>   sumrU   rR   rS   rT   rP   ro   zerosrO   	index_add)ri   rF   rG   rH   rX   rZ   rW   rd   r[   global_idxestempoutputs               r(   r   zMraReduceSum.operator_call   sy   /;/@/@/B,
Iz1|  "#q(KLLw||~!#FGG*//11j! '
I#''A'.66zI7MzZLLa

7>>Z	IIg}GDIIKiXY[_X_N`crNrr
'*y(
) 	 {{/):6l>P>PYeYlYl
 <>FFzSbdno
Oj,HIr2   N)r   r   r   r   r   r$   r2   r(   r   r      s     r2   r   c                 &   | j                         \  }}}||z  }d}	||j                  |||      j                  d      }
| j                  ||||      j                  d      |
dddddf   dz   z  }|j                  ||||      j                  d      |
dddddf   dz   z  }||j                  ||||      j                  d      |
dddddf   dz   z  }	n|t        j                  ||t        j
                  | j                        z  }
| j                  ||||      j                  d      }|j                  ||||      j                  d      }|$|j                  ||||      j                  d      }	t        j                  ||j                  dd            t        j                  |      z  }|j                  dd      j                  }|0|d	|
dddddf   |
dddddf   z  d
k  j                         z  z
  }||
||	fS )z/
    Compute low resolution approximation.
    Nr;   r9   r8   ư>rN   T)r:   keepdims     @g      ?)r=   rU   r   rR   onesfloatrP   meanmatmulrA   mathsqrtr?   r@   )querykeyrW   rV   valuerX   rY   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r(   get_low_resolution_logitr     sN    %*JJL!J:-Ill:/@*MQQVXQYMM*.?XVZZ_aZb1d
#d*
	 ++j*;ZRVV[]V^1d
#d*
 j2CZQYZ^^ce^fAq$J'$.I !5::j:KSXS^S^glgsgs#ttMM*.?XV[[`b[c	++j*;ZRWW\^W_j2CZQYZ__df_gI <<	73D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JIUUr2   c                    | j                   \  }}}|dkD  rf|dz  }t        j                  ||| j                        }	t        j                  t        j
                  |	|       |      }
| |
dddddf   dz  z   } |dkD  r:| ddd|ddf   dz   | ddd|ddf<   | ddddd|f   dz   | ddddd|f<   t        j                  | j                  |d      |ddd	
      }|j                  }|dk(  rE|j                  j                  d      j                  }| |ddddf   k\  j                         }||fS |dk(  rd}||fS t        | d      )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r5   rP   )diagonalNg     @r;   TF)r:   largestsortedfullr9   sparsez# is not a valid approx_model value.)rQ   rR   r   rP   triltriutopkrU   rF   r@   minr   r>   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrX   total_blocks_per_rowrd   offset	temp_maskdiagonal_mask
top_k_valsrF   	thresholdhigh_resolution_masks                  r(   get_block_idxesr   7  s    +?*D*D'J$a&*0A5JJ35IRfRmRmn	

5::i6'#JU[\3mD!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4	!T4-8P PWWY ((( 
	 # ((( K=(KLMMr2   c	                    t         #t        j                  |       j                         S | j	                         \  }	}
}}|	|
z  }||z  dk7  rt        d      ||z  }| j                  |||      } |j                  |||      }|j                  |||      }|-| |dddddf   z  } ||dddddf   z  }||dddddf   z  }|dk(  rt        | ||||      \  }}}}nA|dk(  r1t        j                         5  t        | |||      \  }}}}ddd       nt        d      t        j                         5  z
  }t        |||||      \  }}ddd       t        j                  | ||      t        j                  |      z  }t        ||||      \  }}||z
  }|"|dd	t!        ||      dddddddf   z
  z  z
  }t        j"                  |      }t$        j                  ||||      }t&        j                  ||||      }|dk(  ryt        j"                  z
  dz  z
        dddddf   z  }t        j(                  |      dddddddf   j+                  d	d	|d	      j                  |||      }|j-                  d
      dddddf   j+                  d	d	|      j                  ||      }|j+                  d	d	|      j                  ||      |z
  } || |z  } t        j"                  | | dk  j/                         z        }!||!dddddf   z  }||!z  }t        j"                  |  | dkD  j/                         z        }"||"dddddf   z  }||"z  }||z   |dddddf   |dddddf   z   dz   z  }#n#|dk(  r||dddddf   dz   z  }#nt        d      ||#|dddddf   z  }#|#j                  |	|
||      }#|#S # 1 sw Y   xY w# 1 sw Y   xY w)z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rW   r   r   r;   r9   r   z-config.approx_mode must be "full" or "sparse")r/   rR   
zeros_likerequires_grad_r=   r>   rU   r   no_grad	Exceptionr   rt   r   r   r   rL   r\   expr   r   r   repeatr   r   )$r   r   r   rV   r   r   rW   r   r   rX   num_headrY   r   
meta_batchr   r   r   r   r   rd   low_resolution_logit_normalizedrF   r   high_resolution_logitrJ   rK   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r(   mra2_attentionr   ]  s    &5577.3jjl+J'8h&Jq OPP:-MM*gx8E
++j'8
4CMM*gx8EQ4Z((DAt$$Q4Z((fUm3
D%V
Rk+G 
	 ]]_ 	QisJRN +/KQ	 	
 @AA	 
*>A]*]'(7+(+)
%%
 2??sG
 @ 		( ",,A7L]_p!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu.?  ".!;!;g'8:K" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcelm 	" 6<<Q:NVVWacjknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*hRMS	 	
 
s   7O
3O
OO!c                   *     e Zd ZdZ fdZddZ xZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 p   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz          t+        |dd      | _        | j#                  dt%        j.                  | j0                  j3                         t$        j4                  | j0                  j6                  	      d
       y )N)padding_idxr5   epsposition_ids)r   r;   position_embedding_typeabsolutetoken_type_idsrN   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrR   rS   expandgetattrr   r   r   r=   rT   rP   selfconfig	__class__s     r(   r   zMraEmbeddings.__init__  s?   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mn'.v7PR\']$KK))..0

4K\K\KcKcd 	 	
r2   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr;   r   r   r   rN   r   )r=   r   hasattrr   r   rR   r   rT   rP   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r(   rx   zMraEmbeddings.forward  s=    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r2   )NNNNr   r   r   __doc__r   rx   __classcell__r   s   @r(   r   r     s    Q
( r2   r   c                   (     e Zd Zd fd	ZddZ xZS )MraSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               r!t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t#        j$                  |j                  | j                         | _        t#        j$                  |j                  | j                         | _        t#        j$                  |j                  | j                         | _        t#        j,                  |j.                        | _        ||n|j2                  | _        |j4                  dz  |j6                  z  | _        t;        | j8                  t        |j4                  dz  dz              | _        |j<                  | _        |j>                  | _        |j@                  | _         y # t        $ r#}t        j                  d|        Y d }~d }~ww xY w)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r7   r5   )!r   r   r   num_attention_headsr   r>   r/   r   r   r   r1   r   loggerwarningrC   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   r   block_per_rowrZ   r   r   r   r   )r   r   r   kernel_loadeder   s        r(   r   zMraSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 (t3"$)9);@R@T]jn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$ !88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,+  n!hijhklmmns   
H6 6	I"?II"c           
      l   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }d|dz  z   }|j                         j                  d| j                  d      j                  || j                  z  |      j                         }d}	| j                  |	k  r|| j                  ||	| j                  z
  f}
t        j                  |t        j                  |
|j                        gd      }t        j                  |t        j                  |
|j                        gd      }t        j                  |t        j                  |
|j                        gd      }t!        |j#                         |j#                         |j#                         |j#                         | j$                  | j&                  | j(                  | j*                  	      }| j                  |	k  r|d d d d d d d | j                  f   }|j                  || j                  || j                        }|j-                  d
ddd      j/                         }|j1                         d d | j2                  fz   } |j                  | }|f}|S )Nr;   r   r5         ?r   r7   r   r9   )r   r   r   r   r   r8   )rQ   r   viewr  r  rA   r   r   squeezer   rU   rC   rR   catr   rP   r   r   rZ   r   r   r   permuterB   r=   r  )r   hidden_statesattention_maskrX   rY   rd   query_layer	key_layervalue_layergpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                 r(   rx   zMraSelfAttention.forward.  s   !.!4!4
GQJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ~77""$VAt//3WZ$":"::GDSU	 	 ##m3!4#;#;WmVZVnVnFnnH))[%++h{OaOa2b$ciklK		9ekk(9K[K[.\"]cefI))[%++h{OaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 ##m3)!Q3MT5M5M3M*MNM%--j$:R:RT[]a]u]uv%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD "r2   r#   r   r   r   r   rx   r  r	  s   @r(   r  r  
  s    !VF<r2   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )MraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r(   r   zMraSelfOutput.__init__o  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r2   r  input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r#   r-  r   r   r   r  r.  s      r(   rx   zMraSelfOutput.forwardu  7    

=1]3}|'CDr2   r   r   r   r   rR   Tensorrx   r  r	  s   @r(   r*  r*  n  1    >U\\  RWR^R^ r2   r*  c                   .     e Zd Zd fd	Zd ZddZ xZS )MraAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        y )N)r   )r   r   r  r   r*  r   setpruned_heads)r   r   r   r   s      r(   r   zMraAttention.__init__}  s3    $VE\]	#F+Er2   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r9   )r<   r   r   r  r  r;  r   r   r   r   r   r-  r  union)r   headsindexs      r(   prune_headszMraAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r2   c                 f    | j                  ||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   )r   r   )r   r  r   self_outputsattention_outputr'  s         r(   rx   zMraAttention.forward  s@    yy?;;|AF#%QR(88r2   r#   )r   r   r   r   r@  rx   r  r	  s   @r(   r8  r8  |  s    ";$r2   r8  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r#   )r   r   r   r  r   intermediate_sizer-  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r(   r   zMraIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r2   r  r/  c                 J    | j                  |      }| j                  |      }|S r#   )r-  rL  r   r  s     r(   rx   zMraIntermediate.forward  s&    

=100?r2   r4  r	  s   @r(   rF  rF    s#    9U\\ ell r2   rF  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	MraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r,  )r   r   r   r  rH  r   r-  r   r   r   r   r   r   s     r(   r   zMraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r2   r  r.  r/  c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r#   r1  r2  s      r(   rx   zMraOutput.forward  r3  r2   r4  r	  s   @r(   rP  rP    r6  r2   rP  c                   ,     e Zd Z fdZddZd Z xZS )MraLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr8  	attentionadd_cross_attentionrF  intermediaterP  r   r   s     r(   r   zMraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r2   c                     | j                  ||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S rB  )rY  r   feed_forward_chunkrW  rX  )r   r  r   self_attention_outputsrD  r'  layer_outputs          r(   rx   zMraLayer.forward  sc    !%~!N1!4(,0##T%A%A4CSCSUe
  /G+r2   c                 L    | j                  |      }| j                  ||      }|S r#   )r[  r   )r   rD  intermediate_outputr_  s       r(   r]  zMraLayer.feed_forward_chunk  s,    "//0@A{{#68HIr2   r#   )r   r   r   r   rx   r]  r  r	  s   @r(   rT  rT    s    (r2   rT  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
MraEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersrT  layergradient_checkpointing)r   r   rd   r   s      r(   r   zMraEncoder.__init__  sN    ]]eFD\D\>]#^HV$4#^_
&+# $_s   A#c                     |rdnd }t        | j                        D ]  \  }}|r||fz   } |||      }	|	d   } |r||fz   }|st        d ||fD              S t        ||      S )Nr$   r   c              3   &   K   | ]	  }||  y wr#   r$   ).0vs     r(   	<genexpr>z%MraEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )last_hidden_stater  )	enumeraterh  tupler   )
r   r  r   	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r(   rx   zMraEncoder.forward  s     #7BD(4 	-OA|#$58H$H!(GM)!,M	-   1]4D DX]4E$FXXX1++
 	
r2   )NNFTr(  r	  s   @r(   rc  rc    s    , "
r2   rc  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r,  )r   r   r   r  r   r-  rI  rJ  rK  r   transform_act_fnr   r   r   s     r(   r   z#MraPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr2   r  r/  c                 l    | j                  |      }| j                  |      }| j                  |      }|S r#   )r-  r|  r   rN  s     r(   rx   z"MraPredictionHeadTransform.forward  s4    

=1--m<}5r2   r4  r	  s   @r(   rz  rz    s$    UU\\ ell r2   rz  c                   *     e Zd Z fdZd Zd Z xZS )MraLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)bias)r   r   rz  	transformr   r  r   r   decoder	ParameterrR   r   r  r   s     r(   r   zMraLMPredictionHead.__init__  sm    3F; yy!3!3V5F5FUSLLV->->!?@	 !IIr2   c                 :    | j                   | j                  _         y r#   )r  r  r   s    r(   _tie_weightsz MraLMPredictionHead._tie_weights  s     IIr2   c                 J    | j                  |      }| j                  |      }|S r#   )r  r  rN  s     r(   rx   zMraLMPredictionHead.forward   s$    }5]3r2   )r   r   r   r   r  rx   r  r	  s   @r(   r  r    s    &&r2   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraOnlyMLMHeadc                 B    t         |           t        |      | _        y r#   )r   r   r  predictionsr   s     r(   r   zMraOnlyMLMHead.__init__(  s    .v6r2   sequence_outputr/  c                 (    | j                  |      }|S r#   )r  )r   r  prediction_scoress      r(   rx   zMraOnlyMLMHead.forward,  s     ,,_=  r2   r4  r	  s   @r(   r  r  '  s#    7!u|| ! !r2   r  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)MraPreTrainedModelr   r!   Tmodulec                 H   | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsg        )r   stdNr  )r   initializer_rangerI  r   r  weightdatanormal_r  zero_r   r   r   fill_r  )r   r  r  s      r(   _init_weightsz MraPreTrainedModel._init_weights8  s"   kk++fbii( MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-KK""$MM$$S) 34KK""$ 5r2   N)
r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   Moduler  r$   r2   r(   r  r  1  s&     &*#%BII %r2   r  c                   *    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee   dee   deeef   fd       Z xZS )MraModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r#   )r   r   r   r   r  rc  encoder	post_initr   s     r(   r   zMraModel.__init__N  s;     '/!&) 	r2   c                 .    | j                   j                  S r#   r  r   r  s    r(   get_input_embeddingszMraModel.get_input_embeddingsX  s    ...r2   c                 &    || j                   _        y r#   r  )r   r   s     r(   set_input_embeddingszMraModel.set_input_embeddings[  s    */'r2   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rh  rY  r@  )r   heads_to_prunerh  r>  s       r(   _prune_headszMraModel._prune_heads^  sE    
 +002 	CLE5LLu%//;;EB	Cr2   r   r   r   r   rr  r   rs  rt  r/  c	                    ||n| j                   j                  }||n| j                   j                  }||t        d      |#| j	                  ||       |j                         }	n!||j                         d d }	nt        d      |	\  }
}||j                  n|j                  }|t        j                  |
|f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  |
|      }|}n&t        j                  |	t        j                  |      }| j                  ||	      }| j!                  || j                   j"                        }| j                  ||||      }| j%                  |||||      }|d	   }|s	|f|d
d  z   S t'        ||j(                  |j*                  |j,                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer;   z5You have to specify either input_ids or inputs_embedsr   r   rN   )r   r   r   r   )r   rr  rs  rt  r   r   )ro  r  
attentionscross_attentions)r   rs  use_return_dictr>   %warn_if_padding_and_no_attention_maskr=   rP   rR   r   r   r  r   r   r   rT   get_extended_attention_maskget_head_maskrg  r  r   r  r  r  )r   r   r   r   r   rr  r   rs  rt  r  rX   r  rP   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r(   rx   zMraModel.forwardf  s    %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 ,,2!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r2   )NNNNNNNN)r   r   r   r   r  r  r  r   r   rR   r5  boolr   rq  r   rx   r  r	  s   @r(   r  r  L  s    /0C  -11515/3,004/3&*J
ELL)J
 !.J
 !.	J

 u||,J
 ELL)J
  -J
 'tnJ
 d^J
 
u88	9J
 J
r2   r  c                   L    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   deeef   fd       Z xZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r#   )r   r   r  r!   r  clsr  r   s     r(   r   zMraForMaskedLM.__init__  s4     F#!&) 	r2   c                 B    | j                   j                  j                  S r#   )r  r  r  r  s    r(   get_output_embeddingsz$MraForMaskedLM.get_output_embeddings  s    xx##+++r2   c                     || j                   j                  _        |j                  | j                   j                  _        y r#   )r  r  r  r  )r   new_embeddingss     r(   set_output_embeddingsz$MraForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r2   r   r   r   r   rr  r   labelsrs  rt  r/  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   rr  r   rs  rt  r   r;   r   losslogitsr  r  )
r   r  r!   r  r   r  r   r   r  r  )r   r   r   r   r   rr  r   r  rs  rt  r'  r  r  masked_lm_lossloss_fctr   s                   r(   rx   zMraForMaskedLM.forward  s    & &1%<k$++B]B](())%'!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r2   	NNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r   rR   r5  r  r   rq  r   rx   r  r	  s   @r(   r  r    s   :<Z[,8  -11515/3,004)-/3&*0
ELL)0
 !.0
 !.	0

 u||,0
 ELL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 0
r2   r  c                   (     e Zd ZdZ fdZd Z xZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y r#   )r   r   r   r  r   r-  r   r   r   
num_labelsout_projr   r   s     r(   r   zMraClassificationHead.__init__   sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr2   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r-  r   r   rJ  r  )r   featureskwargsxs       r(   rx   zMraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r2   r  r	  s   @r(   r  r    s    7r2   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y r#   )r   r   r  r  r!   r  
classifierr  r   s     r(   r   z%MraForSequenceClassification.__init__  sA      ++F#/7 	r2   r   r   r   r   rr  r   r  rs  rt  r/  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|
dd z   }||f|z   S |S t        |||
j                   |
j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr;   r  )r   r  r!   r  problem_typer  rO   rR   rT   rC   r	   r  r   r  r   r   r  r  )r   r   r   r   r   rr  r   r  rs  rt  r'  r  r  r  r  r   s                   r(   rx   z$MraForSequenceClassification.forward"  s   & &1%<k$++B]B](())%'!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r2   r  )r   r   r   r   r   r   rR   r5  r  r   rq  r   rx   r  r	  s   @r(   r  r    s      -11515/3,004)-/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 A
r2   r  c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y rV  )
r   r   r  r!   r   r  r   pre_classifierr  r  r   s     r(   r   zMraForMultipleChoice.__init__i  s_     F# ii(:(:F<N<NO))F$6$6: 	r2   r   r   r   r   rr  r   r  rs  rt  r/  c
           
         |	|	n| j                   j                  }	||j                  d   n|j                  d   }
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|
      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r;   r8   r  r   r  )r   r  rQ   r  r=   r!   r  r   ReLUr  r   r   r  r  )r   r   r   r   r   rr  r   r  rs  rt  num_choicesr'  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r(   rx   zMraForMultipleChoice.forwards  s   V &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  	
 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r2   r  )r   r   r   r   r   r   rR   r5  r  r   rq  r   rx   r  r	  s   @r(   r  r  g  s      -11515/3,004)-/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 'tnX
 d^X
 
u//	0X
 X
r2   r  c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r#   )r   r   r  r  r!   r   r   r   r   r  r   r  r  r   s     r(   r   z"MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r2   r   r   r   r   rr  r   r  rs  rt  r/  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r;   r   r  )r   r  r!   r   r  r   r  r  rR   wheretensorignore_indextype_asr   r  r  )r   r   r   r   r   rr  r   r  rs  rt  r'  r  r  r  r  active_lossactive_logitsactive_labelsr   s                      r(   rx   z!MraForTokenClassification.forward  sh   " &1%<k$++B]B](())%'!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r2   r  )r   r   r   r   r   r   rR   r5  r  r   rq  r   rx   r  r	  s   @r(   r  r    s    	  -11515/3,004)-/3&*9
ELL)9
 !.9
 !.	9

 u||,9
 ELL)9
  -9
 &9
 'tn9
 d^9
 
u++	,9
 9
r2   r  c                   X    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   de	e
ef   fd       Z xZS )MraForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr5   )
r   r   r  r  r!   r   r  r   
qa_outputsr  r   s     r(   r   z MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r2   r   r   r   r   rr  r   start_positionsend_positionsrs  rt  r/  c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r;   r9   )r  r5   )r  start_logits
end_logitsr  r  )r   r  r!   r  splitr  r<   r=   clampr   r   r  r  )r   r   r   r   r   rr  r   r  r  rs  rt  r'  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r(   rx   zMraForQuestionAnswering.forward'  s    &1%<k$++B]B](())%'!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r2   )
NNNNNNNNNN)r   r   r   r   r   r   rR   r5  r  r   rq  r   rx   r  r	  s   @r(   r  r    s   
  -11515/3,0042604/3&*<
ELL)<
 !.<
 !.	<

 u||,<
 ELL)<
  -<
 "%,,/<
  -<
 'tn<
 d^<
 
u22	3<
 <
r2   r  )r  r  r  r  r  rT  r  r  r   )NN)r7   r   r   )Pr  r   pathlibr   typingr   r   rR   torch.utils.checkpointr   torch.nnr   r   r	   torch.utils.cpp_extensionr
   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_mrar   
get_loggerr   r  r/   r1   rL   r\   r`   rh   rr   autogradFunctionrt   r   r   r   r   r   r  r   r  r*  r8  rF  rP  rT  rc  rz  r  r  r  r  r  r  r  r  r  r  __all__r$   r2   r(   <module>r     s      "    A A * ! 9  . l l k k ( 
		H	%	C&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf7BII 7t`ryy `HBII 299 Bbii  		 ) :!
 !
J $")) 0!RYY ! % % %2 d
! d
 d
N D
' D
 D
PBII * L
#5 L
L
^ d
- d
 d
N F
 2 F
 F
R J
0 J
 J
Z	r2   