
    rh                        d dl mZmZ d dlmZ d dlZd dlmZ	 d dlZ
d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZmZm Z m!Z!m"Z" d	dl#m$Z$m%Z%m&Z&m'Z' d	dl(m)Z)m*Z*m+Z+ ddl,m-Z-  e+j\                  e/      Z0dZ1dZ2ejf                  Z3d Z4dZ5dZ6 G d dejn                        Z8 G d dejn                        Z9 G d dejn                        Z: G d dejn                        Z; G d dejn                        Z< G d dejn                        Z= G d  d!ejn                        Z> G d" d#ejn                        Z? G d$ d%ejn                        Z@ G d& d'ejn                        ZA G d( d)ejn                        ZB G d* d+ejn                        ZC G d, d-e%      ZD G d. d/ejn                        ZE e)d0e5       G d1 d2eD             ZF e&eFe1ee2        G d3 d4ejn                        ZG e)d5e5       G d6 d7eD             ZH e&eHe1ee2d89        G d: d;ejn                        ZI e)d<e5       G d= d>eD             ZJ e&eJe1e!e2        G d? d@ejn                        ZK e)dAe5       G dB dCeD             ZL e'eLe6j                  dD              e&eLe1ee2        G dE dFejn                        ZN e)dGe5       G dH dIeD             ZO e&eOe1e"e2        G dJ dKejn                        ZP e)dLe5       G dM dNeD             ZQ e&eQe1e e2        G dO dPejn                        ZR e)dQe5       G dR dSeD             ZS e&eSe1ee2       g dTZTy)U    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )	-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )RobertaConfigzFacebookAI/roberta-baser!   c                    | |k7  j                  d      }|j                  dkD  re|j                  d|j                  d   f      }t	        j
                  |d      j                  d      |z  }|j                  | j                        }n)t	        j
                  |d      j                  d      |z  }|j                  d      |z   S )a!  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: jnp.ndarray
        padding_idx: int

    Returns: jnp.ndarray
    i4   r    axis)astypendimreshapeshapejnpcumsum)	input_idspadding_idxmaskincremental_indicess       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/roberta/modeling_flax_roberta.py"create_position_ids_from_input_idsr3   4   s     $,,T2Dyy1}||RB01!jjA6==dCdJ199)//J!jjA6==dCdJ%%d+k99    a   

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _
        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                   | j                        | _        t        j"                  | j                  j$                        | _        y )N)stddev)embedding_initr8   epsilonr8   rate)nnEmbedr7   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger8   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r2   setupzFlaxRobertaEmbeddings.setup   sJ   !xxKK""KK##66..55T[[=Z=Z5[**	 
 $&88KK//KK##66..55T[[=Z=Z5[**	$
  &(XXKK''KK##66..55T[[=Z=Z5[**	&
" dkk.H.HPTPZPZ[zzt{{'F'FGr4   deterministicc                    | j                  |j                  d            }| j                  |j                  d            }| j                  |j                  d            }||z   |z   }	| j	                  |	      }	| j                  |	|      }	|	S )Nr#   rU   )rH   r(   rJ   rL   rM   rQ   )
rS   r.   token_type_idsposition_idsattention_maskrU   inputs_embedsposition_embedsrL   hidden_statess
             r2   __call__zFlaxRobertaEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &(==O }5]-Pr4   NT)__name__
__module____qualname____doc__r!   __annotations__r,   float32r8   rT   boolr^    r4   r2   r6   r6      s0    Q{{E399"H,_c r4   r6   c                       e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
d Zd Zej                  d        Z	 	 	 	 dd
eej"                     dedefdZy	)FlaxRobertaSelfAttentionr7   Fcausalr8   c                 6   | j                   j                  | j                   j                  z  | _        | j                   j                  | j                   j                  z  dk7  rt	        d      t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        | j                  r>t!        t#        j$                  d| j                   j&                  fd      d      | _        y y )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads}r8   kernel_initr    rf   r8   )r7   rC   num_attention_headshead_dim
ValueErrorr@   Denser8   rD   rE   rF   rG   querykeyvaluerj   r	   r,   onesrI   causal_maskrR   s    r2   rT   zFlaxRobertaSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r4   c                     |j                  |j                  d d | j                  j                  | j                  fz         S Nr$   )r*   r+   r7   ro   rp   rS   r]   s     r2   _split_headsz%FlaxRobertaSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr4   c                 n    |j                  |j                  d d | j                  j                  fz         S ry   )r*   r+   r7   rC   rz   s     r2   _merge_headsz%FlaxRobertaSelfAttention._merge_heads   s2    $$]%8%8!%<@W@W?Y%YZZr4   c                 (   | j                  dd      }| j                  ddt        j                  |j                  |j
                        }| j                  ddt        j                  |j                  |j
                        }| j                  ddd       }|r|j                  j                  ^ }	}
}}|j                  }dt        |	      z  |ddfz   }t        j                  |j                  ||      }t        j                  |j                  ||      }||_        ||_        |j                  d   }|j                  |z   |_        t        j                  t        j                  |
      ||z   k  t        |	      d||
fz         }t        ||      }|||fS )	a\  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  L    t        j                  dt         j                        S )Nr   rn   )r,   arrayint32rg   r4   r2   <lambda>z@FlaxRobertaSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIIaWZW`W`Da r4   )r   r   r    )has_variablevariabler,   zerosr+   r8   ru   lenr   dynamic_update_slicebroadcast_toarangetupler   )rS   rt   ru   rs   rZ   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r2   _concatenate_to_cachez.FlaxRobertaSelfAttention._concatenate_to_cache   sr    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;S'JC,,\-?-?PE"J!&L(-A% + 1 14M MK''

:&5N)NNj!Q(A:$NNH +8^DNE>))r4   Nkey_value_states
init_cacheoutput_attentionsc                 4   |d u}|j                   d   }	| j                  |      }
|r#| j                  |      }| j                  |      }n"| j                  |      }| j                  |      }| j	                  |
      }
| j	                  |      }| j	                  |      }| j
                  r|
j                   d   |j                   d   }}| j                  dd      r[| j                  d   d   }| j                  d   d   j                   d   }t        j                  | j                  dd|dfdd||f      }n| j                  d d d d d |d |f   }t        j                  ||	f|j                   dd  z         }|N| j
                  rBt        j                  t        j                  |d      j                         }t        ||      }n(| j
                  r}n|t        j                  |d      }| j
                  r,| j                  dd      s|r| j                  |||
|      \  }}}|t        j                   |dkD  t        j"                  |j                   d      j%                  | j&                        t        j"                  |j                   t        j(                  | j&                        j*                        j%                  | j&                              }nd }d }|s*| j,                  j.                  dkD  r| j1                  d	      }t3        |
|||| j,                  j.                  d
|| j&                  d 	      }|t        j4                  d||      }t        j4                  d||      }|j7                  |j                   d d dz         }|r||f}|S |f}|S )Nr   r    r   r   r   )r&   g        rQ   T)biasdropout_rngdropout_ratebroadcast_dropoutrU   r8   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr$   )r%   )r+   rs   rt   ru   r{   rj   r   	variablesr   dynamic_slicerw   r,   r   expand_dimsr   r   selectfullr(   r8   finfominr7   attention_probs_dropout_probmake_rngr   einsumr*   )rS   r]   rZ   layer_head_maskr   r   rU   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrw   attention_biasr   attn_weightsattn_outputoutputss                          r2   r^   z!FlaxRobertaSelfAttention.__call__   sk    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*L  ,7!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|D
7;7Q7QL,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr4   NFTF)r`   ra   rb   r!   rd   rj   rf   r,   re   r8   rT   r{   r}   r@   compactr   r   ndarrayr^   rg   r4   r2   ri   ri      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _r4   ri   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxRobertaSelfOutputr7   r8   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                  | j                        | _
        t        j                  | j                  j                        | _        y )Nrm   r8   r<   r>   )r@   rr   r7   rC   rD   rE   rF   rG   r8   denserM   rN   rO   rP   rQ   rR   s    r2   rT   zFlaxRobertaSelfOutput.setupf  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr4   rU   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S NrW   r   rQ   rM   )rS   r]   input_tensorrU   s       r2   r^   zFlaxRobertaSelfOutput.__call__o  s;    

=1]-P}|'CDr4   Nr_   r`   ra   rb   r!   rd   r,   re   r8   rT   rf   r^   rg   r4   r2   r   r   b  s,    {{E399"H4 r4   r   c                   x    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 d	defdZy)
FlaxRobertaAttentionr7   Frj   r8   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y )Nrj   r8   rn   )ri   r7   rj   r8   rS   r   outputrR   s    r2   rT   zFlaxRobertaAttention.setup|  s7    ,T[[TXT^T^_	+DKKtzzJr4   Nr   c           	          | j                  |||||||      }|d   }	| j                  |	||      }|f}
|r	|
|d   fz  }
|
S )N)r   r   r   rU   r   r   rW   r    )rS   r   )rS   r]   rZ   r   r   r   rU   r   attn_outputsr   r   s              r2   r^   zFlaxRobertaAttention.__call__  sl     yy+-!'/ ! 
 #1oKm\ "Q))Gr4   r   )r`   ra   rb   r!   rd   rj   rf   r,   re   r8   rT   r^   rg   r4   r2   r   r   w  sG    FD{{E399"K "'  r4   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxRobertaIntermediater7   r8   c                 4   t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        | j                  j                     | _        y Nr   )r@   rr   r7   intermediate_sizerD   rE   rF   rG   r8   r   r   
hidden_act
activationrR   s    r2   rT   zFlaxRobertaIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r4   c                 J    | j                  |      }| j                  |      }|S N)r   r   rz   s     r2   r^   z FlaxRobertaIntermediate.__call__  s$    

=16r4   N
r`   ra   rb   r!   rd   r,   re   r8   rT   r^   rg   r4   r2   r   r     s$    {{E399"9r4   r   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxRobertaOutputr7   r8   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                        | _        t        j                  | j                  j                  | j                        | _        y )Nr   r>   r<   )r@   rr   r7   rC   rD   rE   rF   rG   r8   r   rO   rP   rQ   rM   rN   rR   s    r2   rT   zFlaxRobertaOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r4   rU   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S r   r   )rS   r]   attention_outputrU   s       r2   r^   zFlaxRobertaOutput.__call__  s<    

=1]-P}7G'GHr4   Nr_   r   rg   r4   r2   r   r     s,    {{E399"\t r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 dde	ej                     de	ej                     deded	ef
d
Zy)FlaxRobertaLayerr7   r8   c                    t        | j                  | j                  j                  | j                        | _        t        | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r(t        | j                  d| j                        | _
        y y )Nr   rn   F)r   r7   
is_decoderr8   	attentionr   intermediater   r   add_cross_attentioncrossattentionrR   s    r2   rT   zFlaxRobertaLayer.setup  s    -dkk$++BXBX`d`j`jk3DKKtzzR'4::F;;**"6t{{5X\XbXb"cD +r4   Nencoder_hidden_statesencoder_attention_maskr   rU   r   c	                     | j                  ||||||      }	|	d   }
|| j                  |
|||||      }|d   }
| j                  |
      }| j                  ||
|      }|f}|r||	d   fz  }|	|d   fz  }|S )N)r   r   rU   r   r   )rZ   r   r   rU   r   rW   r    )r   r   r   r   )rS   r]   rZ   r   r   r   r   rU   r   attention_outputsr   cross_attention_outputsr   s                r2   r^   zFlaxRobertaLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;M3CS`a ")!,..G$03A688r4   )NNFTF)r`   ra   rb   r!   rd   r,   re   r8   rT   r   r   rf   r^   rg   r4   r2   r   r     sz    {{E399"d 8<8< ""'+
  (4+ !) 5+ + +  +r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxRobertaLayerCollectionr7   r8   Fgradient_checkpointingc           	         | j                   rjt        t        d      }t        | j                  j
                        D cg c]*  } || j                  t        |      | j                        , c}| _        y t        | j                  j
                        D cg c]-  }t        | j                  t        |      | j                        / c}| _        y c c}w c c}w )N)         )static_argnums)namer8   )	r   rematr   ranger7   num_hidden_layersstrr8   layers)rS   FlaxRobertaCheckpointLayeris      r2   rT   z FlaxRobertaLayerCollection.setup	  s    &&)./?PY)Z& t{{<<= +4;;SV4::VDK t{{<<= !3q6LDK
s   /C2CNr   r   r   rU   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|W|j                   d   t        | j                        k7  r2t        dt        | j                         d|j                   d    d      t	        | j                        D ]@  \  }}|	r||fz  } ||||||   nd |||||      }|d   }|s,||d   fz  }|8||d   fz  }B |	r||fz  }||||f}|
st        d |D              S t        ||||	      S )
Nrg   r   z&The head_mask should be specified for z/ layers, but it is for                         .r    r$   c              3   &   K   | ]	  }||  y wr   rg   ).0vs     r2   	<genexpr>z6FlaxRobertaLayerCollection.__call__.<locals>.<genexpr>L  s     =qq}=s   )last_hidden_stater]   
attentionscross_attentions)r+   r   r   rq   	enumerater   r   )rS   r]   rZ   	head_maskr   r   r   rU   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r2   r^   z#FlaxRobertaLayerCollection.__call__  sl     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++. 	@HAu#!m%55!! ) 5	!4%&!	M *!,M =#3"55(4(]1-=,??(+	@.  -!11 "3^EYZ=G===<++%1	
 	
r4   NNFTFFTr`   ra   rb   r!   rd   r,   re   r8   r   rf   rT   r   r   r^   rg   r4   r2   r   r     s    {{E399"#(D($ 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxRobertaEncoderr7   r8   Fr   c                 f    t        | j                  | j                  | j                        | _        y )Nr8   r   )r   r7   r8   r   r  rR   s    r2   rT   zFlaxRobertaEncoder.setup\  s%    /KK**#'#>#>

r4   Nr   r   r   rU   r   r   r   c                 8    | j                  |||||||||	|

      S )N)r  r   r   r   rU   r   r   r   )r  )rS   r]   rZ   r  r   r   r   rU   r   r   r   s              r2   r^   zFlaxRobertaEncoder.__call__c  s8     zz"7#9!'/!5#  
 	
r4   r  r  rg   r4   r2   r  r  W  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
r4   r  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxRobertaPoolerr7   r8   c                     t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        y r   )
r@   rr   r7   rC   rD   rE   rF   rG   r8   r   rR   s    r2   rT   zFlaxRobertaPooler.setup  sH    XXKK##++224;;3P3PQ**

r4   c                 `    |d d df   }| j                  |      }t        j                  |      S )Nr   )r   r@   tanh)rS   r]   cls_hidden_states      r2   r^   zFlaxRobertaPooler.__call__  s1    (A.::&67ww'((r4   Nr   rg   r4   r2   r  r    s$    {{E399"
)r4   r  c                       e Zd ZU eed<   ej                  Zej                  ed<   ej                  j                  j                  Zedej                  f   ed<   d ZddZy)	FlaxRobertaLMHeadr7   r8   .	bias_initc                    t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _	        t        j                  | j                  j                  | j                        | _        t        j                  | j                  j                  | j                  dt
        j                   j                  j                  | j                  j                              | _        | j                  d| j                   | j                  j                  f      | _        y )Nrl   r<   F)r8   use_biasrm   r   )r@   rr   r7   rC   r8   rD   rE   rF   rG   r   rM   rN   
layer_normrB   decoderparamr  r   rR   s    r2   rT   zFlaxRobertaLMHead.setup  s    XXKK##**++224;;3P3PQ


 ,,t{{/I/IQUQ[Q[\xxKK""**++224;;3P3PQ	
 JJvt~~8N8N7PQ	r4   Nc                 @   | j                  |      }t        d   |      }| j                  |      }|+| j                  j	                  dd|j
                  ii|      }n| j                  |      }t        j                  | j                  | j                        }||z  }|S )Ngeluparamskernel)
r   r   r   r!  applyTr,   asarrayr   r8   )rS   r]   shared_embeddingr   s       r2   r^   zFlaxRobertaLMHead.__call__  s    

=1v}56' LL..8EUEWEW:X/Y[hiM LL7M{{499djj1r4   r   )r`   ra   rb   r!   rd   r,   re   r8   rD   r@   rE   r   r  r   npr   rT   r^   rg   r4   r2   r  r    sL    {{E399"+.66+>+>+D+DIxRZZ(DRr4   r  c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxRobertaClassificationHeadr7   r8   c                    t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _	        | j                  j                  | j                  j                  n| j                  j                  }t        j                  |      | _        t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _        y )Nrl   r>   )r@   rr   r7   rC   r8   rD   rE   rF   rG   r   classifier_dropoutrP   rO   rQ   
num_labelsout_projrS   r/  s     r2   rT   z#FlaxRobertaClassificationHead.setup  s    XXKK##**++224;;3P3PQ

 {{--9 KK**00 	
 zz'9:KK""**++224;;3P3PQ
r4   c                     |d d dd d f   }| j                  ||      }| j                  |      }t        j                  |      }| j                  ||      }| j	                  |      }|S )Nr   rW   )rQ   r   r@   r  r1  )rS   r]   rU   s      r2   r^   z&FlaxRobertaClassificationHead.__call__  sf    %aAg.]-P

=1.]-Pm4r4   Nr_   r   rg   r4   r2   r-  r-    s$    {{E399"
$r4   r-  c                       e Zd ZU dZeZdZdZej                  e
d<   ddej                  ddfd	ed
ededej                  dedef fdZd Zddej(                  j*                  d
ededefdZd Z eej7                  d            	 	 	 	 	 	 	 	 	 	 	 	 	 ddee   dej(                  j*                  dedee   dee   dee   dee   fd       Z xZS ) FlaxRobertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    robertaNmodule_class)r    r    r   TFr7   input_shapeseedr8   _do_initr   c                 \     | j                   d|||d|}t        	| 	  ||||||       y )Nr7   r8   r   )r8  r9  r8   r:  rg   )r7  super__init__)
rS   r7   r8  r9  r8   r:  r   kwargsmodule	__class__s
            r2   r>  z#FlaxRobertaPreTrainedModel.__init__  sA     #""w&Vlwpvw[tSXcklr4   c                 ^    | j                  | j                  | j                  d      | _        y )NTr<  )r7  r7   r8   _modulerR   s    r2   enable_gradient_checkpointingz8FlaxRobertaPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r4   rngr%  returnc                    t        j                  |d      }t        j                  |      }t        || j                  j
                        }t        j                  |      }t        j                  | j                  j                  | j                  j                  f      }t        j                  j                  |      \  }	}
|	|
d}| j                  j                  rTt        j                  || j                  j                  fz         }|}| j                  j                  ||||||||d	      }n"| j                  j                  ||||||d      }|d   }|dt!        t#        |            }t!        t#        |            }| j$                  D ]
  }||   ||<    t'               | _        t)        t+        |            S |S )Nr#   rn   )r%  rQ   F)r   r%  )r,   r   	ones_liker3   r7   pad_token_idrv   r   ro   rD   randomsplitr   rC   r@  initr   r   _missing_keyssetr   r   )rS   rE  r8  r%  r.   rX   rY   rZ   r  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r2   init_weightsz'FlaxRobertaPreTrainedModel.init_weights  s   IIk6	y19)T[[E]E]^y1HHdkk;;T[[=\=\]^	"%**"2"23"7
K$=;;**$'IIkT[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2iyfk #3 # ,H5(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r4   c                    t        j                  ||fd      }t        j                  |d      }t        j                  t        j                  t        j
                  |      j                  d         |j                        }| j                  j                  t        j                  j                  d      |||dd      }t        |d         S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        r#   rn   r%   r   FT)r   r   r   )r,   rv   rH  r   r   
atleast_2dr+   r@  rL  rD   rJ  PRNGKeyr   )rS   r   r   r.   rZ   rY   init_variabless          r2   r   z%FlaxRobertaPreTrainedModel.init_cache  s     HHj*5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9nlX]jn * 
 w/00r4   zbatch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        j
                  |      }| t        || j                   j                        }|t	        j                  |      }|?t	        j                  | j                   j                  | j                   j                  f      }i }|	|	|d<   d|xs | j                  i}| j                   j                  r|r	||d<   dg}nd}| j                  j                  |t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      |||
 |||||      }||r|\  }}t#        |d         |d<   |S |"|s |\  }}|d d	 t#        |d         fz   |d	d  z   }|S | j                  j                  |t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      |
 ||||
      }|S )NrQ   r%  r   Fr#   rn   )rX   rY   r  r   r   rU   r   r   r   rP  mutablerZ  r    )rX   rY   r  rU   r   r   r   rP  )r7   r   r   r   r,   
zeros_liker3   rI  rH  rv   r   ro   r%  r   r@  r'  r   r   )rS   r.   rZ   rX   rY   r  r   r   r%  r   rY  r   r   r   rZ  rP  inputsr\  r   s                      r2   r^   z#FlaxRobertaPreTrainedModel.__call__1  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N=iIaIabL! ]]95N$++"?"?A`A`!abI ")DOF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r4   r   )NNNNNNNNFNNNN) r`   ra   rb   rc   r!   config_classbase_model_prefixr7  r@   Modulerd   r,   re   r   intr8   rf   r>  rD  rD   rJ  rW  r   rT  r   r   ROBERTA_INPUTS_DOCSTRINGformatr   dictr^   __classcell__)rA  s   @r2   r5  r5    sw   
 !L!"L"))"
 $;;',mm m 	m
 yym m !%m
(!

 2 2 (! (!PZ (!fp (!V1& ++C+J+JKh+ij "#!%*.,0/3&**.^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ "$^ k^r4   r5  c                   8   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   dZ
e	ed<   d Z	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                     de	de	de	de	de	fdZy)FlaxRobertaModuler7   r8   Tadd_pooling_layerFr   c                     t        | j                  | j                        | _        t	        | j                  | j                  | j
                        | _        t        | j                  | j                        | _        y )Nrn   r  )	r6   r7   r8   
embeddingsr  r   encoderr  poolerrR   s    r2   rT   zFlaxRobertaModule.setup  sS    /4::N)KK**#'#>#>

 (4::Fr4   NrX   rY   r  r   r   r   rU   r   r   r   c                    |t        j                  |      }|St        j                  t        j                  t        j                  |      j
                  d         |j
                        }| j                  |||||	      }| j                  ||||	||||
||
      }|d   }| j                  r| j                  |      nd }|s|	|f|dd  z   S ||f|dd  z   S t        |||j                  |j                  |j                        S )Nr%   rW   )r  rU   r   r   r   r   r   r   r   r    )r  pooler_outputr]   r  r  )r,   r]  r   r   rV  r+   rk  rl  ri  rm  r   r]   r  r  )rS   r.   rZ   rX   rY   r  r   r   r   rU   r   r   r   r]   r   pooleds                   r2   r^   zFlaxRobertaModule.__call__  s,     ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL~|^S` ( 
 ,,'"7#9!/!5#  
  
/3/E/E]+4~%''!"+55!6*WQR[88?+ !//))$55
 	
r4   )
NNNNNFTFFT)r`   ra   rb   r!   rd   r,   re   r8   ri  rf   r   rT   r   r   r^   rg   r4   r2   rh  rh    s    {{E399""t"#(D(G 15.2+/7;8< ""'%* 5
 !-	5

 s{{+5
 CKK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
r4   rh  zaThe bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxRobertaModelN)r`   ra   rb   rh  r7  rg   r4   r2   rr  rr    s	    
 %Lr4   rr  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)FlaxRobertaForMaskedLMModuler7   r8   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y NF)r7   ri  r8   r   r7   r8   rh  r7   r8   r   r6  r  lm_headrR   s    r2   rT   z"FlaxRobertaForMaskedLMModule.setup  @    (;;#**#'#>#>	
 )4::Nr4   rU   r   r   r   c
                 6   | j                  |||||||||		      }
|
d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                  	      S )
NrU   r   r   r   r   r%  rk  rH   	embeddingr*  r    logitsr]   r  )r6  r7   tie_word_embeddingsr   ry  r   r]   r  )rS   r.   rZ   rX   rY   r  rU   r   r   r   r   r]   r*  r  s                 r2   r^   z%FlaxRobertaForMaskedLMModule.__call__  s     ,,'/!5#  

  
;;**#||55h?MN_`alm# m>NO9wqr{**!!//))
 	
r4   NTFFTr`   ra   rb   r!   rd   r,   re   r8   r   rf   rT   r^   rg   r4   r2   rt  rt    sf    {{E399"#(D(O  #"'%* )
 )
  )
 #)
 )
r4   rt  z5RoBERTa Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxRobertaForMaskedLMN)r`   ra   rb   rt  r7  rg   r4   r2   r  r     s    /Lr4   r  z<mask>)r0   c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)*FlaxRobertaForSequenceClassificationModuler7   r8   Fr   c                     t        | j                  | j                  d| j                        | _        t        | j                  | j                        | _        y )NFr7   r8   ri  r   rw  )rh  r7   r8   r   r6  r-  
classifierrR   s    r2   rT   z0FlaxRobertaForSequenceClassificationModule.setup3  sC    (;;**##'#>#>	
 8t{{RVR\R\]r4   rU   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                        S Nr|  r   rW   r    r  )r6  r  r   r]   r  )rS   r.   rZ   rX   rY   r  rU   r   r   r   r   sequence_outputr  s                r2   r^   z3FlaxRobertaForSequenceClassificationModule.__call__<  s     ,,'/!5#  

 "!*N9wqr{**+!//))
 	
r4   Nr  r  rg   r4   r2   r  r  .  sf    {{E399"#(D(^  #"'%* #
 #
  #
 ##
 #
r4   r  z
    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)$FlaxRobertaForSequenceClassificationN)r`   ra   rb   r  r7  rg   r4   r2   r  r  b  s	     >Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)"FlaxRobertaForMultipleChoiceModuler7   r8   Fr   c                    t        | j                  | j                  | j                        | _        t        j                  | j                  j                        | _        t        j                  d| j                        | _
        y )Nr<  r>   r    rn   )rh  r7   r8   r   r6  r@   rO   rP   rQ   rr   r  rR   s    r2   rT   z(FlaxRobertaForMultipleChoiceModule.setup{  sW    (;;**#'#>#>

 zzt{{'F'FG((1DJJ7r4   rU   r   r   r   c
                 <   |j                   d   }
||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }| j                  |||||||||		      }|d   }| j                  ||      }| j	                  |      }|j                  d|
      }|	s	|f|dd  z   S t        ||j                  |j                        S )Nr    r%   r|  rW   r$   r  )r+   r*   r6  rQ   r  r   r]   r  )rS   r.   rZ   rX   rY   r  rU   r   r   r   num_choicesr   pooled_outputr  reshaped_logitss                  r2   r^   z+FlaxRobertaForMultipleChoiceModule.__call__  sH     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ,,'/!5#  

  
]-P/ ..[9#%33,"!//))
 	
r4   Nr  r  rg   r4   r2   r  r  v  se    {{E399"#(D(8  #"'%* ,
 ,
  ,
 #,
 ,
r4   r  z
    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxRobertaForMultipleChoiceN)r`   ra   rb   r  r7  rg   r4   r2   r  r    s	     6Lr4   r  z(batch_size, num_choices, sequence_lengthc            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)'FlaxRobertaForTokenClassificationModuler7   r8   Fr   c                    t        | j                  | j                  d| j                        | _        | j                  j
                  | j                  j
                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y )NFr  r>   rn   )rh  r7   r8   r   r6  r/  rP   r@   rO   rQ   rr   r0  r  r2  s     r2   rT   z-FlaxRobertaForTokenClassificationModule.setup  s    (;;**##'#>#>	
 {{--9 KK**00 	
 zz'9:((4;;#9#9Lr4   rU   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j
                        S r  )r6  rQ   r  r   r]   r  )rS   r.   rZ   rX   rY   r  rU   r   r   r   r   r]   r  s                r2   r^   z0FlaxRobertaForTokenClassificationModule.__call__  s     ,,'/!5#  

  
]-P/9wqr{**(!//))
 	
r4   Nr  r  rg   r4   r2   r  r    sf    {{E399"#(D(M, #"'%* $
 $
  $
 #$
 $
r4   r  z
    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZy)!FlaxRobertaForTokenClassificationN)r`   ra   rb   r  r7  rg   r4   r2   r  r    s	     ;Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)%FlaxRobertaForQuestionAnsweringModuler7   r8   Fr   c                     t        | j                  | j                  d| j                        | _        t        j                  | j                  j                  | j                        | _        y )NFr  rn   )	rh  r7   r8   r   r6  r@   rr   r0  
qa_outputsrR   s    r2   rT   z+FlaxRobertaForQuestionAnsweringModule.setup  sJ    (;;**##'#>#>	
 ((4;;#9#9Lr4   rU   r   r   r   c
                 b   | j                  |||||||||		      }
|
d   }| j                  |      }t        j                  || j                  j
                  d      \  }}|j                  d      }|j                  d      }|	s
||f|
dd  z   S t        |||
j                  |
j                        S )Nr|  r   r%   r&   r    )start_logits
end_logitsr]   r  )
r6  r  r,   rK  r7   r0  squeezer   r]   r  )rS   r.   rZ   rX   rY   r  rU   r   r   r   r   r]   r  r  r  s                  r2   r^   z.FlaxRobertaForQuestionAnsweringModule.__call__'  s     ,,'/!5#  

  
/#&99VT[[5K5KRT#U j#++B/''+
 *-;;/%!!//))	
 	
r4   Nr  r  rg   r4   r2   r  r    sf    {{E399"#(D(M  #"'%* (
 (
  (
 #(
 (
r4   r  z
    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)FlaxRobertaForQuestionAnsweringN)r`   ra   rb   r  r7  rg   r4   r2   r  r  R  s	     9Lr4   r  c                   
   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     de	de	de	de	de	fdZy)FlaxRobertaForCausalLMModuler7   r8   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y rv  rx  rR   s    r2   rT   z"FlaxRobertaForCausalLMModule.setupj  rz  r4   NrX   r  r   r   r   rU   r   r   r   c                 R   | j                  |||||||||	|
||      }|d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|s	|f|dd  z   S t        ||j                  |j                  |j                  	      S )
N)r   r   r   rU   r   r   r   r   r%  rk  rH   r}  r~  r    )r  r]   r  r  )	r6  r7   r  r   ry  r   r]   r  r  )rS   r.   rZ   rY   rX   r  r   r   r   rU   r   r   r   r   r]   r*  r  s                    r2   r^   z%FlaxRobertaForCausalLMModule.__call__s  s      ,,"7#9!'/!5#  
  
;;**#||55h?MN_`alm# m>NO9wqr{**4!//))$55	
 	
r4   )	NNNNFTFFTr  rg   r4   r2   r  r  e  s    {{E399"#(D(O 15+/7;8< ""'%* 0

 !-0
 CKK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
r4   r  z
    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   >    e Zd ZeZddeej                     fdZd Z	y)FlaxRobertaForCausalLMNrZ   c                 H   |j                   \  }}| j                  ||      }t        j                  ||fd      }|-|j	                  d      dz
  }t        j                  ||d      }n4t        j                  t        j                  |d      d d d f   ||f      }|||dS )Nr#   rn   r%   r&   r    )r   r   )rZ  rZ   rY   )	r+   r   r,   rv   r-   r   r   r   r   )	rS   r.   r   rZ   r   
seq_lengthrZ  extended_attention_maskrY   s	            r2   prepare_inputs_for_generationz4FlaxRobertaForCausalLM.prepare_inputs_for_generation  s    !*
J//*jA #&((J
+C4"P%)00b09A=L&)&>&>?VXfhn&o#++CJJz,NtUVw,WZdfpYqrL  /5(
 	
r4   c                 L    |j                   |d<   |d   d d dd f   dz   |d<   |S )NrZ  rY   r%   r    )rZ  )rS   model_outputsmodel_kwargss      r2   update_inputs_for_generationz3FlaxRobertaForCausalLM.update_inputs_for_generation  s8    *7*G*G&''3N'CArsF'Ka'O^$r4   r   )
r`   ra   rb   r  r7  r   rD   Arrayr  r  rg   r4   r2   r  r    s'     0L
S[\_\e\eSf 
*r4   r  )r  r  r  r  r  r  rr  r5  )Utypingr   r   
flax.linenlinenr@   rD   	jax.numpynumpyr,   r+  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_robertar!   
get_loggerr`   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r3   ROBERTA_START_DOCSTRINGrc  ra  r6   ri   r   r   r   r   r   r   r  r  r  r-  r5  rh  rr  rt  r  r  r  r  r  rd  r  r  r  r  r  r  __all__rg   r4   r2   <module>r     s   &  
   > > 6 6 > ; 
 
 
 w v Y Y 0 
		H	%/ !:0 .# N(BII (Xhryy hXBII *'299 'Vbii &		 *6ryy 6tO
 O
f$
 $
P)		 )" 		  FBII @}!4 }BD
		 D
N g%1 %	% -/BDbds t7
299 7
t QSjk07 0 l0 "	1
 1
h  >+E >> ( 	:
 :
z  6#= 66  ":"A"ABl"m  !	8
bii 8
v  ;(B ;; %	6
BII 6
r  9&@ 99 #$	>
299 >
B  7 < )		r4   