
"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, Optional, Union

import tensorflow as tf


try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# Keras has moved this submodule between releases, so resolve whichever location is available.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
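
    Example (a minimal sketch; the learning rate and step counts are illustrative, and the wrapped decay schedule
    is an ordinary Keras `PolynomialDecay`):

    ```python
    >>> import tensorflow as tf
    >>> from transformers.optimization_tf import WarmUp

    >>> decay = tf.keras.optimizers.schedules.PolynomialDecay(
    ...     initial_learning_rate=5e-5, decay_steps=9_000, end_learning_rate=0.0
    ... )
    >>> schedule = WarmUp(initial_learning_rate=5e-5, decay_schedule_fn=decay, warmup_steps=1_000)
    >>> lr = schedule(500)  # still inside warmup: 5e-5 * (500 / 1_000) ** 1.0
    ```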
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: Optional[str] = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Implements polynomial warmup: while global_step < warmup_steps, the learning rate is
            # `initial_learning_rate * (global_step / warmup_steps) ** power`.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }


def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[list[str]] = None,
):
    """
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
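
    Example (a minimal sketch; the learning rate, step counts and weight decay below are illustrative values, not
    defaults):

    ```python
    >>> from transformers.optimization_tf import create_optimizer

    >>> optimizer, lr_schedule = create_optimizer(
    ...     init_lr=5e-5,
    ...     num_train_steps=10_000,
    ...     num_warmup_steps=1_000,
    ...     weight_decay_rate=0.01,
    ... )
    >>> # `optimizer` can be passed to `model.compile(...)`; the schedule is returned separately so the
    >>> # learning rate can be tracked independently of the optimizer.
    ```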
    """
    # Implements linear decay of the learning rate (a polynomial decay with the given `power`).
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # The schedule is returned alongside the optimizer so the learning rate can be tracked independently.
    return optimizer, lr_schedule


class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://huggingface.co/papers/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply the AMSGrad variant of this algorithm. See [On the Convergence of Adam and
            Beyond](https://huggingface.co/papers/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value. `decay` is included for backward compatibility to allow
            time-inverse decay of the learning rate. `lr` is included for backward compatibility; it is recommended
            to use `learning_rate` instead.
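
    Example (a minimal sketch; the hyperparameter values are illustrative, and the exclusion patterns mirror the
    defaults used by `create_optimizer`):

    ```python
    >>> from transformers.optimization_tf import AdamWeightDecay

    >>> optimizer = AdamWeightDecay(
    ...     learning_rate=3e-5,
    ...     weight_decay_rate=0.01,
    ...     exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    ... )
    ```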
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[list[str]] = None,
        exclude_from_weight_decay: Optional[list[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super().from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super().apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        # Apply the decoupled weight decay before the regular Adam update.
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True


class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
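
    Example (a minimal sketch with hand-built tensors standing in for real gradients; in a training loop the
    accumulated values would come from a `tf.GradientTape` and be passed to `optimizer.apply_gradients` every
    N steps, as described above):

    ```python
    >>> import tensorflow as tf
    >>> from transformers.optimization_tf import GradientAccumulator

    >>> accumulator = GradientAccumulator()
    >>> accumulator([tf.constant([1.0, 2.0])])  # accumulate a first "gradient"
    >>> accumulator([tf.constant([3.0, 4.0])])  # add a second one on top
    >>> summed = accumulator.gradients  # locally summed gradients: [[4.0, 6.0]]
    >>> accumulator.reset()
    ```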
    """

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))