from __future__ import annotations

from collections.abc import Iterator
from contextlib import nullcontext
from functools import partial

import torch
import tqdm
from torch import Tensor, nn
from torch.utils.checkpoint import get_device_states, set_device_states

from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder
from sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss import MultipleNegativesRankingLoss


class RandContext:
    """
    Random-state context manager class. Reference: https://github.com/luyug/GradCache.

    This class will back up the pytorch's random state during initialization. Then when the context is activated,
    the class will set up the random state with the backed-up one.
    """

    def __init__(self, *tensors) -> None:
        self.fwd_cpu_state = torch.get_rng_state()
        self.fwd_gpu_devices, self.fwd_gpu_states = get_device_states(*tensors)

    def __enter__(self) -> None:
        self._fork = torch.random.fork_rng(devices=self.fwd_gpu_devices, enabled=True)
        self._fork.__enter__()
        torch.set_rng_state(self.fwd_cpu_state)
        set_device_states(self.fwd_gpu_devices, self.fwd_gpu_states)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self._fork.__exit__(exc_type, exc_val, exc_tb)
        self._fork = None


def _backward_hook(
    grad_output: Tensor,
    pairs: list[list[str]],
    loss_obj: CachedMultipleNegativesRankingLoss,
) -> None:
    """A backward hook to backpropagate the cached gradients mini-batch by mini-batch."""
    assert loss_obj.cache is not None
    assert loss_obj.random_states is not None
    with torch.enable_grad():
        for (minibatch_logits, _), minibatch_grad in zip(
            loss_obj.predict_minibatch_iter(
                pairs=pairs,
                with_grad=True,
                copy_random_state=False,
                random_states=loss_obj.random_states,
            ),
            loss_obj.cache,
        ):
            surrogate = torch.dot(minibatch_logits.flatten(), minibatch_grad.flatten()) * grad_output
            surrogate.backward()


class CachedMultipleNegativesRankingLoss(MultipleNegativesRankingLoss):
    def __init__(
        self,
        model: CrossEncoder,
        num_negatives: int | None = 4,
        scale: float = 10.0,
        activation_fn: nn.Module | None = nn.Sigmoid(),
        mini_batch_size: int = 32,
        show_progress_bar: bool = False,
    ) -> None:
        """
        Boosted version of :class:`~sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss` that
        caches the gradients of the loss wrt. the logits. This allows for much higher batch sizes without extra
        memory usage. However, it is slightly slower.

        In detail:

            (1) It first does a quick prediction step without gradients/computation graphs to get all the logits;
            (2) Calculate the loss, backward up to the logits and cache the gradients wrt. the logits;
            (3) A 2nd prediction step with gradients/computation graphs and connect the cached gradients into the backward chain.

        Notes: All steps are done with mini-batches. In the original implementation of GradCache, (2) is not done in
        mini-batches and requires a lot of memory when the batch size is large. The gradient caching will sacrifice around
        20% computation time according to the paper.
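
        As a rough sketch of steps (1)-(3) above (illustrative only; it reuses this class's own helpers, whose
        exact signatures are internal and may change)::

            logits = []
            for minibatch_logits, _ in self.predict_minibatch_iter(
                pairs=pairs, with_grad=False, copy_random_state=True
            ):
                logits.append(minibatch_logits.detach().requires_grad_())  # (1) no-grad prediction
            loss = self.calculate_loss(logits, batch_size)
            loss.backward()                                                # (2) backward up to the logits ...
            cache = [logit.grad for logit in logits]                       # ... and cache their gradients
            # (3) a backward hook replays the prediction with gradients enabled and backpropagates
            #     the cached gradients mini-batch by mini-batch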

        Given a list of (anchor, positive) pairs or (anchor, positive, negative) triplets, this loss optimizes the following:

        * Given an anchor (e.g. a question), assign the highest similarity to the corresponding positive (i.e. answer)
          out of every single positive and negative (e.g. all answers) in the batch.

        If you provide the optional negatives, they will all be used as extra options from which the model must pick the
        correct positive. Within reason, the harder this "picking" is, the stronger the model will become. Because of
        this, a higher batch size results in more in-batch negatives, which then increases performance (to a point).

        This loss function works great for training rerankers in retrieval setups where you have positive pairs
        (e.g. (query, answer)), as it will sample ``n-1`` negative docs randomly in each batch.

        This loss is also known as InfoNCE loss with GradCache.

        Args:
            model (:class:`~sentence_transformers.cross_encoder.CrossEncoder`): A CrossEncoder model to be trained.
            num_negatives (int, optional): Number of in-batch negatives to sample for each anchor. Defaults to 4.
            scale (float, optional): Output of the activation function is multiplied by the scale value. Defaults to 10.0.
            activation_fn (:class:`~torch.nn.Module`): Activation function applied to the logits before computing the loss. Defaults to :class:`~torch.nn.Sigmoid`.
            mini_batch_size (int, optional): Mini-batch size for the forward pass. This informs the memory usage. Defaults to 32.
            show_progress_bar (bool, optional): Whether to show a progress bar during the forward pass. Defaults to False.

        .. note::

            The current default values are subject to change in the future. Experimentation is encouraged.

        References:
            - Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4: https://arxiv.org/pdf/1705.00652.pdf
            - Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf
            - `Cross Encoder > Training Examples > MS MARCO <../../../examples/cross_encoder/training/ms_marco/README.html>`_
            - `Cross Encoder > Training Examples > Rerankers <../../../examples/cross_encoder/training/rerankers/README.html>`_

        Requirements:
            1. Your model must be initialized with `num_labels = 1` (a.k.a. the default) to predict one class.
            2. Should be used with large `per_device_train_batch_size` and low `mini_batch_size` for superior performance,
               but slower training time than :class:`MultipleNegativesRankingLoss`.

        Inputs:
            +-------------------------------------------------+--------+-------------------------------+
            | Texts                                           | Labels | Number of Model Output Labels |
            +=================================================+========+===============================+
            | (anchor, positive) pairs                        | none   | 1                             |
            +-------------------------------------------------+--------+-------------------------------+
            | (anchor, positive, negative) triplets           | none   | 1                             |
            +-------------------------------------------------+--------+-------------------------------+
            | (anchor, positive, negative_1, ..., negative_n) | none   | 1                             |
            +-------------------------------------------------+--------+-------------------------------+

        Recommendations:
            - Use ``BatchSamplers.NO_DUPLICATES`` (:class:`docs <sentence_transformers.training_args.BatchSamplers>`) to
              ensure that no in-batch negatives are duplicates of the anchor or positive samples.
            - Use :class:`~sentence_transformers.util.mine_hard_negatives` with ``output_format="n-tuple"`` or
              ``output_format="triplet"`` to convert question-answer pairs to triplets with hard negatives.
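
              A rough sketch of this conversion (the mining model, dataset, and argument values here are just
              placeholders; see the ``mine_hard_negatives`` documentation for the full set of parameters)::

                  from datasets import load_dataset
                  from sentence_transformers import SentenceTransformer
                  from sentence_transformers.util import mine_hard_negatives

                  # A bi-encoder is only used here to mine hard negatives for the (query, answer) pairs
                  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
                  pair_dataset = load_dataset("sentence-transformers/natural-questions", split="train")
                  triplet_dataset = mine_hard_negatives(
                      pair_dataset,
                      embedding_model,
                      num_negatives=5,
                      output_format="triplet",
                  )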

        Relations:
            - Equivalent to :class:`~sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss`, but with
              caching that allows for much higher batch sizes (and thus better performance) without extra memory usage.
              This loss also trains slower than :class:`~sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss`.

        Example:
            ::

                from sentence_transformers.cross_encoder import CrossEncoder, CrossEncoderTrainer, losses
                from datasets import Dataset

                model = CrossEncoder("microsoft/mpnet-base")
                train_dataset = Dataset.from_dict({
                    "query": ["What are pandas?", "What is the capital of France?"],
                    "answer": ["Pandas are a kind of bear.", "The capital of France is Paris."],
                })
                loss = losses.CachedMultipleNegativesRankingLoss(model, mini_batch_size=32)

                trainer = CrossEncoderTrainer(
                    model=model,
                    train_dataset=train_dataset,
                    loss=loss,
                )
                trainer.train()
        """
        super().__init__(model, num_negatives, scale, activation_fn)
        self.mini_batch_size = mini_batch_size
        self.show_progress_bar = show_progress_bar
        self.cross_entropy_loss = nn.CrossEntropyLoss()
        self.cache = None
        self.random_states = None

        if not isinstance(self.model, CrossEncoder):
            raise ValueError(
                f"{self.__class__.__name__} expects a model of type CrossEncoder, "
                f"but got a model of type {type(self.model)}."
            )

        if self.model.num_labels != 1:
            raise ValueError(
                f"{self.__class__.__name__} expects a model with 1 output label, "
                f"but got a model with {self.model.num_labels} output labels."
            )

    def predict_minibatch(
        self,
        pairs: list[list[str]],
        with_grad: bool,
        copy_random_state: bool,
        random_state: RandContext | None = None,
    ) -> tuple[Tensor, RandContext | None]:
        """Do forward pass on a minibatch of the input pairs and return the corresponding logits."""
        grad_context = nullcontext if with_grad else torch.no_grad
        random_state_context = nullcontext() if random_state is None else random_state
        with random_state_context:
            with grad_context():
                random_state = RandContext(pairs) if copy_random_state else None
                logits = self.call_model_with_pairs(pairs)
        return logits, random_state

    def predict_minibatch_iter(
        self,
        pairs: list[list[str]],
        with_grad: bool,
        copy_random_state: bool,
        random_states: list[RandContext] | None = None,
    ) -> Iterator[tuple[Tensor, RandContext | None]]:
        """Do forward pass on all mini-batches of the input pairs and yield the corresponding logits."""
        for i, b in enumerate(
            tqdm.trange(
                0,
                len(pairs),
                self.mini_batch_size,
                desc="Predict mini-batches",
                disable=not self.show_progress_bar,
            )
        ):
            e = b + self.mini_batch_size
            mini_batch_pairs = pairs[b:e]
            logits, random_state = self.predict_minibatch(
                pairs=mini_batch_pairs,
                with_grad=with_grad,
                copy_random_state=copy_random_state,
                random_state=None if random_states is None else random_states[i],
            )
            yield logits, random_state

    def calculate_loss_and_cache_gradients(self, logits: list[Tensor], batch_size: int) -> Tensor:
        """Calculate the cross-entropy loss and cache the gradients wrt. the logits."""
        loss = self.calculate_loss(logits, batch_size)
        loss.backward()
        loss = loss.detach().requires_grad_()

        self.cache = [logit.grad for logit in logits]

        return loss

    def forward(self, inputs: list[list[str]], labels: Tensor) -> Tensor:
        anchors = inputs[0][:]
        candidates = inputs[1][:]
        batch_size = len(anchors)

        # Add sampled in-batch negatives and any provided hard negatives as extra candidates for each anchor
        for negatives in self.get_in_batch_negatives(inputs[0], inputs[1:]):
            anchors.extend(inputs[0])
            candidates.extend(negatives)

        for negatives in inputs[2:]:
            anchors.extend(inputs[0])
            candidates.extend(negatives)

        pairs = list(zip(anchors, candidates))

        # Step (1): prediction without gradients, keeping the logits and the random states for the second pass
        logits = []
        self.random_states = []
        for minibatch_logits, random_state in self.predict_minibatch_iter(
            pairs=pairs,
            with_grad=False,
            copy_random_state=True,
        ):
            logits.append(minibatch_logits.detach().requires_grad_())
            self.random_states.append(random_state)

        if torch.is_grad_enabled():
            # Step (2): calculate the loss, backward up to the logits, and cache the gradients wrt. the logits.
            # Step (3) happens in the registered hook: the prediction is replayed with gradients enabled and the
            # cached gradients are backpropagated mini-batch by mini-batch.
            loss = self.calculate_loss_and_cache_gradients(logits, batch_size)
            loss.register_hook(partial(_backward_hook, pairs=pairs, loss_obj=self))
        else:
            # During evaluation, the gradient cache is not needed
            loss = self.calculate_loss(logits, batch_size)

        return loss

    def get_config_dict(self):
        return {**super().get_config_dict(), "mini_batch_size": self.mini_batch_size}