
from typing import Any, Optional, Union

from ...configuration_utils import PretrainedConfig, layer_type_validation


class T5GemmaModuleConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate a T5GemmaModule
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the T5GemmaModule-7B.
    e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`T5GemmaModuleModel`].
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, it will default to
            `num_attention_heads`. A minimal sketch of this mean-pooling conversion is shown after this argument list.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the input and output word embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores (queries are scaled by `query_pre_attn_scalar**-0.5`).
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer; when unset, it defaults to alternating sliding-window and full
            attention (see the usage example at the end of this docstring).
        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
            Scaling factor when applying tanh softcapping on the logits (i.e. `logits = cap * tanh(logits / cap)`).
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
            Scaling factor when applying tanh softcapping on the attention scores.
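
    A minimal sketch of the mean-pooling conversion described for `num_key_value_heads` above (shapes and values
    are illustrative only and not part of the `transformers` API):

    ```python
    >>> import torch
    >>> num_heads, num_kv_heads, head_dim, hidden_size = 8, 4, 256, 2304
    >>> k_proj = torch.randn(num_heads * head_dim, hidden_size)  # multi-head K projection weight
    >>> group_size = num_heads // num_kv_heads
    >>> # average the K heads inside each group to obtain one key head per group
    >>> k_proj_gqa = (
    ...     k_proj.view(num_kv_heads, group_size, head_dim, hidden_size)
    ...     .mean(dim=1)
    ...     .reshape(num_kv_heads * head_dim, hidden_size)
    ... )
    >>> k_proj_gqa.shape
    torch.Size([1024, 2304])
    ```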

    ```python
    >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
    >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
    >>> configuration = T5GemmaModuleConfig()
    >>> # Initializing a model from the t5_gemma_module-7b style configuration
    >>> model = T5GemmaModuleModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
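    >>> # Illustrative only: defaults can be overridden at construction time, e.g. a smaller
    >>> # configuration for quick experiments (values here are arbitrary)
    >>> tiny_configuration = T5GemmaModuleConfig(num_hidden_layers=4, hidden_size=256, intermediate_size=1024, num_attention_heads=4)
    >>> # If `layer_types` is not given, it defaults to alternating sliding-window and full attention
    >>> tiny_configuration.layer_types
    ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention']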
    ```"""

    model_type = "t5_gemma_module"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=2304,
        intermediate_size=9216,
        num_hidden_layers=26,
        num_attention_heads=8,
        num_key_value_heads=4,
        head_dim=256,
        hidden_activation="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        query_pre_attn_scalar=256,
        sliding_window=4096,
        layer_types=None,
        final_logit_softcapping=30.0,
        attn_logit_softcapping=50.0,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.layer_types = layer_types

        if self.layer_types is None:
            # Alternate sliding-window and full attention, starting with a sliding-window layer.
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % 2) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)


class T5GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate a T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
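    >>> # Illustrative only: the encoder and decoder can also be configured explicitly with
    >>> # T5GemmaModuleConfig instances (argument values here are arbitrary)
    >>> from transformers import T5GemmaModuleConfig
    >>> module_config = T5GemmaModuleConfig(num_hidden_layers=2, hidden_size=256, intermediate_size=1024, num_attention_heads=4)
    >>> custom_config = T5GemmaConfig(encoder=module_config, decoder=module_config)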
    ```
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, *optional*):
            Configuration for the encoder; a plain dict is converted to a [`T5GemmaModuleConfig`] (see the example
            after this argument list).
        decoder (`Union[T5GemmaModuleConfig, dict]`, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie input and output embeddings.
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5Gemma model (the same as Gemma 2).
        kwargs (additional keyword arguments, *optional*):
            Will be passed to the PretrainedConfig base class.
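
    For illustration (values are arbitrary), `encoder` and `decoder` may also be passed as plain dicts, which are
    converted to [`T5GemmaModuleConfig`] instances:

    ```python
    >>> config = T5GemmaConfig(
    ...     encoder={"num_hidden_layers": 2, "hidden_size": 256, "num_attention_heads": 4},
    ...     decoder={"num_hidden_layers": 2, "hidden_size": 256, "num_attention_heads": 4},
    ... )
    ```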
    """

    model_type = "t5gemma"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "encoder.layers.*.self_attn.q_proj": "colwise",
        "encoder.layers.*.self_attn.k_proj": "colwise",
        "encoder.layers.*.self_attn.v_proj": "colwise",
        "encoder.layers.*.self_attn.o_proj": "rowwise",
        "encoder.layers.*.mlp.gate_proj": "colwise",
        "encoder.layers.*.mlp.up_proj": "colwise",
        "encoder.layers.*.mlp.down_proj": "rowwise",
        "decoder.layers.*.self_attn.q_proj": "colwise",
        "decoder.layers.*.self_attn.k_proj": "colwise",
        "decoder.layers.*.self_attn.v_proj": "colwise",
        "decoder.layers.*.self_attn.o_proj": "rowwise",
        "decoder.layers.*.cross_attn.q_proj": "colwise",
        "decoder.layers.*.cross_attn.k_proj": "colwise",
        "decoder.layers.*.cross_attn.v_proj": "colwise",
        "decoder.layers.*.cross_attn.o_proj": "rowwise",
        "decoder.layers.*.mlp.gate_proj": "colwise",
        "decoder.layers.*.mlp.up_proj": "colwise",
        "decoder.layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "encoder.embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "encoder.layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "encoder.norm": (["hidden_states"], ["hidden_states"]),
        "decoder.embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "decoder.layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "decoder.norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        encoder: Optional[Union[T5GemmaModuleConfig, dict[str, Any]]] = None,
        decoder: Optional[Union[T5GemmaModuleConfig, dict[str, Any]]] = None,
        is_encoder_decoder: bool = True,
        dropout_rate: float = 0.0,
        classifier_dropout_rate: float = 0.0,
        attention_dropout: float = 0.0,
        tie_word_embeddings: bool = True,
        vocab_size: int = 256000,
        **kwargs,
    ):
        # Encoder/decoder module configs may be given as dicts, as T5GemmaModuleConfig objects, or omitted.
        if isinstance(encoder, dict):
            encoder = T5GemmaModuleConfig(**encoder)
        elif encoder is None:
            encoder = T5GemmaModuleConfig()
        else:
            assert isinstance(encoder, T5GemmaModuleConfig), f"{type(encoder)} is not supported."

        if isinstance(decoder, dict):
            decoder = T5GemmaModuleConfig(**decoder)
        elif decoder is None:
            decoder = encoder
        else:
            assert isinstance(decoder, T5GemmaModuleConfig), f"{type(decoder)} is not supported."

        # Re-instantiate so the encoder and decoder do not share a mutable config object.
        encoder = T5GemmaModuleConfig(**encoder.to_dict())
        decoder = T5GemmaModuleConfig(**decoder.to_dict())

        encoder.is_decoder = False
        encoder.dropout_rate = dropout_rate
        encoder.attention_dropout = attention_dropout
        self.encoder = encoder

        decoder.is_decoder = True
        decoder.use_cache = True
        decoder.dropout_rate = dropout_rate
        decoder.attention_dropout = attention_dropout
        decoder.cross_attention_hidden_size = encoder.hidden_size
        self.decoder = decoder

        # Special tokens default to the decoder's values unless explicitly overridden.
        for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id"]:
            if special_token_key not in kwargs:
                kwargs[special_token_key] = getattr(decoder, special_token_key)

        super().__init__(**kwargs)

        self.is_encoder_decoder = is_encoder_decoder
        self.use_cache = kwargs.get("use_cache", decoder.use_cache)
        self.initializer_range = kwargs.get("initializer_range", decoder.initializer_range)

        self.dropout_rate = dropout_rate
        self.classifier_dropout_rate = classifier_dropout_rate
        self.attention_dropout = attention_dropout
        self.tie_word_embeddings = tie_word_embeddings
        self.vocab_size = vocab_size

    def __setattr__(self, key, value):
        # Keep these attributes in sync between the top-level config and the encoder/decoder submodules.
        shared_attr_with_submodules = [
            "output_hidden_states",
            "output_attentions",
            "_attn_implementation",
            "dropout_rate",
            "attention_dropout",
            "vocab_size",
        ]
        if key in shared_attr_with_submodules:
            setattr(self.encoder, key, value)
            setattr(self.decoder, key, value)
        super().__setattr__(key, value)

    def get_text_config(self, decoder=False):
        # The same config is returned whether or not the decoder variant is requested.
        del decoder
        return self


__all__ = ["T5GemmaConfig", "T5GemmaModuleConfig"]