
    rhY                     n   d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%  e#jL                  e'      Z(da)d Z*d Z+d Z,d Z- G d dej\                  j^                        Z0 G d dej\                  j^                        Z1 G d de	jd                        Z3 G d de	jd                        Z4 G d de	jd                        Z5 G d de	jd                        Z6 G d  d!e	jd                        Z7 G d" d#e	jd                        Z8 G d$ d%e      Z9 G d& d'e	jd                        Z: G d( d)e	jd                        Z; G d* d+e	jd                        Z< G d, d-e	jd                        Z=e  G d. d/e             Z>e  G d0 d1e>             Z?e  G d2 d3e>             Z@ G d4 d5e	jd                        ZA e d67       G d8 d9e>             ZBe  G d: d;e>             ZCe  G d< d=e>             ZDe  G d> d?e>             ZEg d@ZFy)AzPyTorch YOSO model.    N)Path)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_ninja_availableis_torch_cuda_availablelogging   )
YosoConfigc                  H    ddl m}  d } |g d      } | d|d       dd lay )Nr   )loadc                     t        t              j                         j                  j                  j                  dz  dz  }| D cg c]  }||z  	 c}S c c}w )Nkernelsyoso)r   __file__resolveparent)files
src_folderfiles      y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/yoso/modeling_yoso.pyappend_rootz&load_cuda_kernels.<locals>.append_root9   sJ    (^++-44;;BBYNQWW
.34d
T!444s   A)zfast_lsh_cumulation_torch.cppzfast_lsh_cumulation.cuzfast_lsh_cumulation_cuda.cufast_lsh_cumulationT)verbose)torch.utils.cpp_extensionr   r*   lsh_cumulation)r   r)   	src_filess      r(   load_cuda_kernelsr/   5   s'    .5 vwI		480    c                     t        | t              r<g }| D ]3  }|j                         s|j                         }|j	                  |       5 |S | j                         s| j                         } | S N)
isinstancelistis_contiguous
contiguousappendinput_tensorsouttensors      r(   to_contiguousr<   D   sm    -&# 	F'')**,JJv	 
**,)446Mr0   c                     t        | t              r<g }| D ]3  }|j                  t        j                  j                  |dd             5 |S t        j                  j                  | dd      S )N   )pdim)r3   r4   r7   r   
functional	normalizer8   s      r(   rC   rC   R   se    -&# 	EFJJr}}..v.CD	E
}}&&}r&BBr0   c                 z   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      t        j                  | j                  d      | j                  d      ||z  | j
                        }dt        j                  || j
                        z  }t        j                  | |      j                  | j                  d      | j                  d      ||      }t        j                  ||      j                  |j                  d      |j                  d      ||      }|dkD  j                         }|dkD  j                         }	t        j                  ||z  d	      }
t        j                  |	|z  d	      }
|
j                         |
j                         fS )
Nr
   zQuery has incorrect size.zKey has incorrect size.r   r>   devicer   r?   rA   )lensize
ValueErrortorchrandnrF   arangematmulreshapeintsum)querykeynum_hashhash_lenrmat	raise_powquery_projectionkey_projectionquery_binary
key_binary
query_hashs              r(   hashingr]   \   sX   
5::<A455
388:!233;;uzz!}ejjmX5HQVQ]Q]^DU\\(5<<@@I||E4088A

STW_aij\\#t,44SXXa[#((1+xYabN$q(--/L 1$))+J<)3<J:	1r:J>>Z^^---r0   c                   ,    e Zd Zed        Zed        Zy)YosoCumulationc           
      N   |d   }dt        j                  t        j                  ||j                  dd                  t        j
                  z  z
  |z  }||d d d d d f   z  |d d d d d f   z  }t        j                  ||      }	| j                  ||||||       || _        |	S )Nhash_code_lenr   r?   )rK   acosrN   	transposemathpisave_for_backwardconfig)
ctx
query_maskkey_maskrR   rS   valuerh   ra   expectationcumulation_values
             r(   forwardzYosoCumulation.forwardp   s    /5::ell5#--B:O&PQTXT[T[[[`mm!Jq!Tz$::Xaqj=QQ <<U;j(KUS
r0   c                    t        |      }| j                  \  }}}}}}| j                  }|d   }	t        j                  ||j                  dd            |z  }
t        j                  |
|	dz  |z        }t        j                  |
j                  dd      |	dz  |z        }t        j                  |j                  dd      |      }d d |||d fS )Nra   r?   rb   r>   )r<   saved_tensorsrh   rK   rN   rd   )ri   gradrj   rk   rm   rR   rS   rl   rh   ra   weighted_exp
grad_querygrad_key
grad_values                 r(   backwardzYosoCumulation.backward}   s    T"?B?P?P<
Hk5#u/||D%//"b*AB[P\\,1Bc0IJ
<< 6 6r2 >QRARV[@[\\\+"7"7B"?F
T:xTAAr0   N__name__
__module____qualname__staticmethodro   rw    r0   r(   r_   r_   o   s*    
  
  B Br0   r_   c                   ,    e Zd Zed        Zed        Zy)YosoLSHCumulationc           
         |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d	      t        |||||g      \  }}}}}|j                  }|d
   }|d   }	t	        d|	z        }
|d   r t
        j                  ||||||	|d      \  }}nt        ||||	      \  }}t
        j                  ||||||
|d      }| j                  |||||||       || _	        |S )Nr   z6Query mask and Key mask differ in sizes in dimension 0z3Query mask and Query differ in sizes in dimension 0z1Query mask and Key differ in sizes in dimension 0z8Query mask and Value mask differ in sizes in dimension 0r   z,Key and Value differ in sizes in dimension 1r>   z,Query and Key differ in sizes in dimension 2rT   ra   use_fast_hash)
rI   rJ   r<   is_cudarP   r-   	fast_hashr]   rg   rh   )ri   rj   rk   rR   rS   rl   rh   use_cudarT   ra   hashtable_capacityquery_hash_codekey_hash_codern   s                 r(   ro   zYosoLSHCumulation.forward   s   ??1q!11UVV??1A.RSS??1!,PQQ??1A.WXX88A;%**Q-'KLL::a=CHHQK'KLL2?XW\^ach@i2j/
HeS%%%*%/ M!12/"-;-E-EE8S(M8UV.*O] .5UC=-Y*O])88=%I[]egh
 	j(O]TY[^`ef
r0   c                    t        |      }| j                  \  }}}}}}}| j                  }	|j                  }
|	d   }t	        d|z        }|	d   rft
        j                  |||||||
d      }t
        j                  |||||||dz  |z  ||
d
      }t
        j                  |||||||dz  |z  ||
d
      }ndt        j                  t        j                  ||j                  dd                  t        j                  z  z
  |z  }||d d d d d f   z  |d d d d d f   z  }t        j                  ||j                  dd            |z  }t        j                  ||dz  |z        }t        j                  |j                  dd      |dz  |z        }t        j                  |j                  dd      |      }d d |||d fS )Nra   r>   lsh_backwardr      r?   rb   )r<   rq   rh   r   rP   r-   lsh_weighted_cumulationrK   rc   rN   rd   re   rf   )ri   rr   rj   rk   r   r   rR   rS   rl   rh   r   ra   r   rv   rt   ru   rm   rs   s                     r(   rw   zYosoLSHCumulation.backward   s   T"RURcRcO
Ho}eS%<</ M!12.!'66-_dL^`hjkJ (??"c)"J &=="e+"H uzz%,,ucmmBPR>S*TUX\X_X___dqqK%
1a:(>>!TST*AUUK <<eoob".EFTLl]Q5F#4MNJ||L$:$:2r$B]UVEVZ_D_`Hk&;&;B&CTJJT:xTAAr0   Nrx   r}   r0   r(   r   r      s+    #  # J .B .Br0   r   c                   *     e Zd ZdZ fdZddZ xZS )YosoEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 t   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz   d       t+        |dd	      | _        | j#                  d
t%        j.                  | j0                  j3                         t$        j4                  | j0                  j6                        d       y )N)padding_idxr>   epsposition_ids)r   r?   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtyperF   )super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrK   rM   expandgetattrr   zerosr   rI   longrF   selfrh   	__class__s     r(   r   zYosoEmbeddings.__init__   sL   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWX[\\in 	 	
 (/v7PR\']$KK))..0

4K\K\KcKcd 	 	
r0   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr?   r   r   r   r   r   )rI   r   hasattrr   r   rK   r   r   rF   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r(   ro   zYosoEmbeddings.forward  s=    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r0   )NNNNry   rz   r{   __doc__r   ro   __classcell__r   s   @r(   r   r      s    Q
, r0   r   c                   (     e Zd Zd fd	ZddZ xZS )YosoSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j*                  |j,                        | _        ||n|j0                  | _        |j2                  | _        |j4                  | _        |j6                  d u| _        |j:                  | _        |j<                  | _        |j>                  | _        | j4                  | j:                  | j<                  | j>                  d| _         |j6                  Zt!        jB                  |j                  |j                  |j6                  df|j6                  d	z  dfd
|j                        | _"        y y # t        $ r#}t        j                  d|        Y d }~Fd }~ww xY w)Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: )ra   r   rT   r   r   r>   F)in_channelsout_channelskernel_sizepaddingbiasgroups)#r   r   r   num_attention_headsr   rJ   r-   r   r   r/   	ExceptionloggerwarningrP   attention_head_sizeall_head_sizer   LinearrR   rS   rl   r   attention_probs_dropout_probr   r   use_expectationra   conv_windowuse_convr   rT   r   
lsh_configConv2dconv)r   rh   r   kernel_loadeder   s        r(   r   zYosoSelfAttention.__init__%  so    : ::a?PVXhHi#F$6$6#7 8 445Q8  'd2"$);)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$  &55#11**$6#11"// "//!// --	
 )		"66#77#//3++q0!411DI *=  n!hijhklmmns   =
J& &	K/KKc                 H   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  r| j                  |	|d d d d d d f   z        }
|j                         \  }}}}|j                  ||z  ||      }|j                  ||z  ||      }|	j                  ||z  ||      }	d|dz  z   }|j                  d      j                  |d      j                  ||z  |      j                         }d}| j                  s||k  r||z  |||z
  f}t!        j"                  |t!        j$                  ||j&                        gd      }t!        j"                  |t!        j$                  ||j&                        gd      }t!        j"                  |	t!        j$                  ||	j&                        gd      }	| j                  s| j(                  rt+        ||g      \  }}| j                  r%t,        j/                  |||||	| j0                        }n$t2        j/                  |||||	| j0                        }| j                  s||k  r|d d d d d |f   }t+        |      }|j                  ||||      }| j                  r|
z  }|j5                  d	ddd
      j7                         }|j                         d d | j8                  fz   } |j                  | }|r||f}|S |f}|S )Nr?   r   r>         ?g     @rG       rE   r   r
   rb   )shaperR   viewr   r   rd   rS   rl   r   r   rI   rO   	unsqueezerepeat_interleaverP   r   rK   catr   rF   trainingrC   r_   applyr   r   permuter6   r   )r   hidden_statesattention_maskoutput_attentions
batch_sizer   _query_layer	key_layervalue_layerconv_value_layer	num_headsseq_lenhead_dimgpu_warp_sizepad_sizecontext_layernew_context_layer_shapeoutputss                      r(   ro   zYosoSelfAttention.forwardX  s   $1$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ==#yy~aqRVFV7W)WX3>3C3C3E0
Iw!))*y*@'8T%%j9&<gxP	!))*y*@'8T~77$$Q'ya0WZ)+W5SU	 	 $$(]*B!I-w8PPH))KK1C1CD K 		KK1A1AB I  ))KK1C1CD K 4==%.Y/G%H"K*00YUYUdUdM .33YUYUdUdM $$(]*B)!Q		/:M!-0%--j)WhW==--M%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD4E=-0 MZK[r0   r2   NFry   rz   r{   r   ro   r   r   s   @r(   r   r   $  s    1f\r0   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )YosoSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r   r   denser   r   r   r   r   r   s     r(   r   zYosoSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r0   r   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r2   r   r   r   r   r   r   s      r(   ro   zYosoSelfOutput.forward  7    

=1]3}|'CDr0   ry   rz   r{   r   rK   Tensorro   r   r   s   @r(   r   r     1    >U\\  RWR^R^ r0   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )YosoAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        y )N)r   )r   r   r   r   r   outputsetpruned_heads)r   rh   r   r   s      r(   r   zYosoAttention.__init__  s3    %fF]^	$V,Er0   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rG   )rH   r   r   r   r   r
  r   rR   rS   rl   r  r   r   union)r   headsindexs      r(   prune_headszYosoAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r0   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r  )r   r   r   r   self_outputsattention_outputr   s          r(   ro   zYosoAttention.forward  sC    yy@QR;;|AF#%QR(88r0   r2   r   )ry   rz   r{   r   r  ro   r   r   s   @r(   r  r    s    ";$r0   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r2   )r   r   r   r   r   intermediate_sizer   r3   
hidden_actstrr   intermediate_act_fnr   s     r(   r   zYosoIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r0   r   r   c                 J    | j                  |      }| j                  |      }|S r2   )r   r  r   r   s     r(   ro   zYosoIntermediate.forward  s&    

=100?r0   r  r   s   @r(   r  r    s#    9U\\ ell r0   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
YosoOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   r   r  r   r   r   r   r   r   r   r   s     r(   r   zYosoOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r0   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r2   r   r   s      r(   ro   zYosoOutput.forward  r  r0   r  r   s   @r(   r  r    r  r0   r  c                   ,     e Zd Z fdZddZd Z xZS )	YosoLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionadd_cross_attentionr  intermediater  r  r   s     r(   r   zYosoLayer.__init__  sW    '-'E'E$&v.#)#=#= ,V4 (r0   c                     | j                  |||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r   r   r   )r&  r   feed_forward_chunkr$  r%  )r   r   r   r   self_attention_outputsr  r   layer_outputs           r(   ro   zYosoLayer.forward  sh    !%~ar!s1!4(,0##T%A%A4CSCSUe
  /G+r0   c                 L    | j                  |      }| j                  ||      }|S r2   )r(  r  )r   r  intermediate_outputr,  s       r(   r*  zYosoLayer.feed_forward_chunk  s,    "//0@A{{#68HIr0   r   )ry   rz   r{   r   ro   r*  r   r   s   @r(   r!  r!    s    )r0   r!  c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )YosoEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r   r   rh   r   
ModuleListrangenum_hidden_layersr!  layergradient_checkpointing)r   rh   r   r   s      r(   r   zYosoEncoder.__init__#  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c                     |rdnd }|rdnd }t        | j                        D ](  \  }	}
|r||fz   } |
|||      }|d   }|s ||d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )Nr}   r   r   c              3   &   K   | ]	  }||  y wr2   r}   ).0vs     r(   	<genexpr>z&YosoEncoder.forward.<locals>.<genexpr>C  s     mq_`_lms   )last_hidden_stater   
attentions)	enumerater5  tupler   )r   r   r   	head_maskr   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_outputss               r(   ro   zYosoEncoder.forward)  s     #7BD$5b4(4 	POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm1++*
 	
r0   )NNFFTr   r   s   @r(   r0  r0  "  s    , "
r0   r0  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r   r   r   r   r   r   r3   r  r  r   transform_act_fnr   r   r   s     r(   r   z$YosoPredictionHeadTransform.__init__M  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr0   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r2   )r   rK  r   r  s     r(   ro   z#YosoPredictionHeadTransform.forwardV  s4    

=1--m<}5r0   r  r   s   @r(   rI  rI  L  s$    UU\\ ell r0   rI  c                   *     e Zd Z fdZd Zd Z xZS )YosoLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)r   )r   r   rI  	transformr   r   r   r   decoder	ParameterrK   r   r   r   s     r(   r   zYosoLMPredictionHead.__init___  sm    4V< yy!3!3V5F5FUSLLV->->!?@	 !IIr0   c                 :    | j                   | j                  _         y r2   )r   rQ  r   s    r(   _tie_weightsz!YosoLMPredictionHead._tie_weightsl  s     IIr0   c                 J    | j                  |      }| j                  |      }|S r2   )rP  rQ  r  s     r(   ro   zYosoLMPredictionHead.forwardo  s$    }5]3r0   )ry   rz   r{   r   rU  ro   r   r   s   @r(   rN  rN  ^  s    &&r0   rN  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoOnlyMLMHeadc                 B    t         |           t        |      | _        y r2   )r   r   rN  predictionsr   s     r(   r   zYosoOnlyMLMHead.__init__w  s    /7r0   sequence_outputr   c                 (    | j                  |      }|S r2   )rZ  )r   r[  prediction_scoress      r(   ro   zYosoOnlyMLMHead.forward{  s     ,,_=  r0   r  r   s   @r(   rX  rX  v  s#    8!u|| ! !r0   rX  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)YosoPreTrainedModelrh   r!   Tmodulec                 H   | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsg        )meanstdNr   )rh   initializer_ranger3   r   r   weightdatanormal_r   zero_r   r   r   fill_rN  )r   r`  rc  s      r(   _init_weightsz!YosoPreTrainedModel._init_weights  s"   kk++fbii( MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-KK""$MM$$S) 45KK""$ 6r0   N)
ry   rz   r{   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   Modulerj  r}   r0   r(   r_  r_    s$    &*#%BII %r0   r_  c                   6    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee   dee   dee   deeef   fd       Z xZS )	YosoModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r2   )r   r   rh   r   r   r0  encoder	post_initr   s     r(   r   zYosoModel.__init__  s;     (0"6* 	r0   c                 .    | j                   j                  S r2   r   r   rT  s    r(   get_input_embeddingszYosoModel.get_input_embeddings  s    ...r0   c                 &    || j                   _        y r2   ru  )r   rl   s     r(   set_input_embeddingszYosoModel.set_input_embeddings  s    */'r0   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrr  r5  r&  r  )r   heads_to_pruner5  r  s       r(   _prune_headszYosoModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr0   r   r   r   r   r@  r   r   rA  rB  r   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  || j                   j"                        }| j                  ||||      }| j%                  ||||||	      }|d	   }|	s	|f|d
d  z   S t'        ||j(                  |j*                  |j,                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer?   z5You have to specify either input_ids or inputs_embedsrE   r   r   )r   r   r   r   )r   r@  r   rA  rB  r   r   )r<  r   r=  cross_attentions)rh   r   rA  use_return_dictrJ   %warn_if_padding_and_no_attention_maskrI   rF   rK   onesr   r   r   r   r   r   get_head_maskr4  rr  r   r   r=  r~  )r   r   r   r   r   r@  r   r   rA  rB  r   r   r   rF   r   r   embedding_outputencoder_outputsr[  s                      r(   ro   zYosoModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z &&y$++2O2OP	??%)'	 + 
 ,,)/!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r0   )	NNNNNNNNN)ry   rz   r{   r   rv  rx  r|  r   r   rK   r  boolr   r?  r   ro   r   r   s   @r(   rp  rp    s   /0C  -11515/3,004,0/3&*I
ELL)I
 !.I
 !.	I

 u||,I
 ELL)I
  -I
 $D>I
 'tnI
 d^I
 
u88	9I
 I
r0   rp  c                   X    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   dee   deeef   fd       Z xZS )YosoForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r2   )r   r   rp  r!   rX  clsrs  r   s     r(   r   zYosoForMaskedLM.__init__  s4     f%	"6* 	r0   c                 B    | j                   j                  j                  S r2   )r  rZ  rQ  rT  s    r(   get_output_embeddingsz%YosoForMaskedLM.get_output_embeddings  s    xx##+++r0   c                     || j                   j                  _        |j                  | j                   j                  _        y r2   )r  rZ  rQ  r   )r   new_embeddingss     r(   set_output_embeddingsz%YosoForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r0   r   r   r   r   r@  r   labelsr   rA  rB  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   r@  r   r   rA  rB  r   r?   r   losslogitsr   r=  )
rh   r  r!   r  r   r   r   r   r   r=  )r   r   r   r   r   r@  r   r  r   rA  rB  r   r[  r]  masked_lm_lossloss_fctr  s                    r(   ro   zYosoForMaskedLM.forward  s    ( &1%<k$++B]B]))))%'/!5#  

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r0   
NNNNNNNNNN)ry   rz   r{   _tied_weights_keysr   r  r  r   r   rK   r  r  r   r?  r   ro   r   r   s   @r(   r  r    s   :<Z[,8  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
r0   r  c                   (     e Zd ZdZ fdZd Z xZS )YosoClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y r2   )r   r   r   r   r   r   r   r   r   
num_labelsout_projrh   r   s     r(   r   zYosoClassificationHead.__init__N  sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr0   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r   r   rh   r  r  )r   featureskwargsxs       r(   ro   zYosoClassificationHead.forwardV  se    Q1WLLOJJqM4;;))*1-LLOMM!r0   r   r   s   @r(   r  r  K  s    7r0   r  z
    YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ef   fd       Z xZS )YosoForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y r2   )r   r   r  rp  r!   r  
classifierrs  r   s     r(   r   z&YosoForSequenceClassification.__init__g  sA      ++f%	08 	r0   r   r   r   r   r@  r   r  r   rA  rB  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr?   r  )rh   r  r!   r  problem_typer  r   rK   r   rP   r	   squeezer   r   r   r   r   r=  )r   r   r   r   r   r@  r   r  r   rA  rB  r   r[  r  r  r  r  s                    r(   ro   z%YosoForSequenceClassification.forwardp  s   ( &1%<k$++B]B]))))%'/!5#  

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r0   r  )ry   rz   r{   r   r   r   rK   r  r  r   r?  r   ro   r   r   s   @r(   r  r  `  s     -11515/3,004)-,0/3&*C
ELL)C
 !.C
 !.	C

 u||,C
 ELL)C
  -C
 &C
 $D>C
 'tnC
 d^C
 
u..	/C
 C
r0   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ef   fd       Z xZS )YosoForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y r#  )
r   r   rp  r!   r   r   r   pre_classifierr  rs  r   s     r(   r   zYosoForMultipleChoice.__init__  s_     f%	 ii(:(:F<N<NO))F$6$6: 	r0   r   r   r   r   r@  r   r  r   rA  rB  r   c                    |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r?   rb   r  r   r  )rh   r  r   r   rI   r!   r  r   ReLUr  r   r   r   r=  )r   r   r   r   r   r@  r   r  r   rA  rB  num_choicesr   hidden_statepooled_outputr  reshaped_logitsr  r  r  s                       r(   ro   zYosoForMultipleChoice.forward  s    X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  

 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r0   r  )ry   rz   r{   r   r   r   rK   r  r  r   r?  r   ro   r   r   s   @r(   r  r    s     -11515/3,004)-,0/3&*Z
ELL)Z
 !.Z
 !.	Z

 u||,Z
 ELL)Z
  -Z
 &Z
 $D>Z
 'tnZ
 d^Z
 
u//	0Z
 Z
r0   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ef   fd       Z xZS )YosoForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r2   )r   r   r  rp  r!   r   r   r   r   r   r   r  rs  r   s     r(   r   z#YosoForTokenClassification.__init__#  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r0   r   r   r   r   r@  r   r  r   rA  rB  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r?   r   r  )rh   r  r!   r   r  r   r   r  rK   wherer;   ignore_indextype_asr   r   r=  )r   r   r   r   r   r@  r   r  r   rA  rB  r   r[  r  r  r  active_lossactive_logitsactive_labelsr  s                       r(   ro   z"YosoForTokenClassification.forward.  sk   $ &1%<k$++B]B]))))%'/!5#  

 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r0   r  )ry   rz   r{   r   r   r   rK   r  r  r   r?  r   ro   r   r   s   @r(   r  r  !  s    	  -11515/3,004)-,0/3&*;
ELL);
 !.;
 !.	;

 u||,;
 ELL);
  -;
 &;
 $D>;
 'tn;
 d^;
 
u++	,;
 ;
r0   r  c                   d    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   dee   de	e
ef   fd       Z xZS )YosoForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr>   )
r   r   r  rp  r!   r   r   r   
qa_outputsrs  r   s     r(   r   z!YosoForQuestionAnswering.__init__o  s[      ++f%	))F$6$68I8IJ 	r0   r   r   r   r   r@  r   start_positionsend_positionsr   rA  rB  r   c                    ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r?   rG   )r  r>   )r  start_logits
end_logitsr   r=  )rh   r  r!   r  splitr  rH   rI   clampr   r   r   r=  )r   r   r   r   r   r@  r   r  r  r   rA  rB  r   r[  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r(   ro   z YosoForQuestionAnswering.forward{  s    &1%<k$++B]B]))))%'/!5#  

 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r0   )NNNNNNNNNNN)ry   rz   r{   r   r   r   rK   r  r  r   r?  r   ro   r   r   s   @r(   r  r  m  s   
  -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r0   r  )r  r  r  r  r  r!  rp  r_  )Gr   re   pathlibr   typingr   r   rK   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_yosor   
get_loggerry   r   r-   r/   r<   rC   r]   autogradFunctionr_   r   rn  r   r   r   r  r  r  r!  r0  rI  rN  rX  r_  rp  r  r  r  r  r  r  __all__r}   r0   r(   <module>r     s^      "    A A ! 9  . l l  + 
		H	% 1C.&BU^^,, B>VB// VBt9RYY 9xP		 PhRYY BII Bryy   * :&
")) &
T")) $299 0!bii ! %/ % %2 c
# c
 c
L F
) F
 F
RRYY * N
$7 N
N
b f
/ f
 f
R H
!4 H
 H
V L
2 L
 L
^	r0   