
    rh'!                    n    d dl mZ d dlmZ d dlZd dlmZmZ d dlmZm	Z	  G d dej                        Zy)    )annotations)IterableN)Tensornn)SentenceTransformerutilc                  f     e Zd Zej                  df	 	 	 	 	 d fdZddZddZed	d       Z	 xZ
S )
DistillKLDivLossg      ?c                    t         |           || _        || _        || _        t        j                  d      | _        y)a  
        Compute the KL divergence loss between probability distributions derived from student and teacher models' similarity scores.
        By default, similarity is calculated using the dot-product. This loss is designed for knowledge distillation
        where a smaller student model learns from a more powerful teacher model.

        The loss computes softmax probabilities from the teacher similarity scores and log-softmax probabilities
        from the student model, then calculates the KL divergence between these distributions.

        Args:
            model: SentenceTransformer model (student model)
            similarity_fct: Which similarity function to use for the student model
            temperature: Temperature parameter to soften probability distributions (higher temperature = softer distributions)
                A temperature of 1.0 does not scale the scores. Note: in the v5.0.1 release, the default temperature was changed from 2.0 to 1.0.

        References:
            - For more details, please refer to https://arxiv.org/abs/2010.11386

        Requirements:
            1. (query, positive, negative_1, ..., negative_n) examples
            2. Labels containing teacher model's scores between query-positive and query-negative pairs

        Inputs:
            +------------------------------------------------+------------------------------------------------------------+
            | Texts                                          | Labels                                                     |
            +================================================+============================================================+
            | (query, positive, negative)                    | [Teacher(query, positive), Teacher(query, negative)]       |
            +------------------------------------------------+------------------------------------------------------------+
            | (query, positive, negative_1, ..., negative_n) | [Teacher(query, positive), Teacher(query, negative_i)...]  |
            +------------------------------------------------+------------------------------------------------------------+

        Relations:
            - Similar to :class:`~sentence_transformers.losses.MarginMSELoss` but uses KL divergence instead of MSE
            - More suited for distillation tasks where preserving ranking is important

        Example:

            Using a teacher model to compute similarity scores for distillation:

            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset
                import torch

                student_model = SentenceTransformer("microsoft/mpnet-base")
                teacher_model = SentenceTransformer("all-mpnet-base-v2")
                train_dataset = Dataset.from_dict({
                    "query": ["It's nice weather outside today.", "He drove to work."],
                    "positive": ["It's so sunny.", "He took the car to work."],
                    "negative": ["It's very cold.", "She walked to the store."],
                })

                def compute_labels(batch):
                    emb_queries = teacher_model.encode(batch["query"])
                    emb_positives = teacher_model.encode(batch["positive"])
                    emb_negatives = teacher_model.encode(batch["negative"])

                    pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
                    neg_scores = teacher_model.similarity_pairwise(emb_queries, emb_negatives)

                    # Stack the scores for positive and negative pairs
                    return {
                        "label": torch.stack([pos_scores, neg_scores], dim=1)
                    }

                train_dataset = train_dataset.map(compute_labels, batched=True)
                loss = losses.DistillKLDivLoss(student_model)

                trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
                trainer.train()

            With multiple negatives:

            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset
                import torch

                student_model = SentenceTransformer("microsoft/mpnet-base")
                teacher_model = SentenceTransformer("all-mpnet-base-v2")

                train_dataset = Dataset.from_dict(
                    {
                        "query": ["It's nice weather outside today.", "He drove to work."],
                        "positive": ["It's so sunny.", "He took the car to work."],
                        "negative1": ["It's very cold.", "She walked to the store."],
                        "negative2": ["Its rainy", "She took the bus"],
                    }
                )


                def compute_labels(batch):
                    emb_queries = teacher_model.encode(batch["query"])
                    emb_positives = teacher_model.encode(batch["positive"])
                    emb_negatives1 = teacher_model.encode(batch["negative1"])
                    emb_negatives2 = teacher_model.encode(batch["negative2"])

                    pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
                    neg_scores1 = teacher_model.similarity_pairwise(emb_queries, emb_negatives1)
                    neg_scores2 = teacher_model.similarity_pairwise(emb_queries, emb_negatives2)

                    # Stack the scores for positive and multiple negative pairs
                    return {
                        "label": torch.stack([pos_scores, neg_scores1, neg_scores2], dim=1)
                    }

                train_dataset = train_dataset.map(compute_labels, batched=True)
                loss = losses.DistillKLDivLoss(student_model)

                trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
                trainer.train()
        	batchmean)	reductionN)super__init__modelsimilarity_fcttemperaturer   	KLDivLossloss_fct)selfr   r   r   	__class__s       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/losses/DistillKLDivLoss.pyr   zDistillKLDivLoss.__init__   s8    h 	
,&{;    c                r    |D cg c]  }| j                  |      d    }}| j                  ||      S c c}w )Nsentence_embedding)r   compute_loss_from_embeddings)r   sentence_featureslabelssentence_feature
embeddingss        r   forwardzDistillKLDivLoss.forward   s?    arsM]djj!123GHs
s00VDD ts   4c           	     j   |d   }t        j                  |dd  D cg c]  }| j                  ||       c}d      }|| j                  z  }t        j                  |d      }|| j                  z  }t        j
                  |d      }| j                  ||      }	|	| j                  dz  z  }	|	S c c}w )Nr      )dim   )torchstackr   r   log_softmaxsoftmaxr   )
r   r   r   embeddings_queryembeddings_otherstudent_scoresstudent_log_probsteacher_scoresteacher_probslosss
             r   r   z-DistillKLDivLoss.compute_loss_from_embeddings   s    %a= ]ghihj]klIYT  !13CDl

 ($*:*::!--n!D  $"2"22n!< }}.>t''*+ ms   B0c                     y)Nai  
@misc{lin2020distillingdenserepresentationsranking,
      title={Distilling Dense Representations for Ranking using Tightly-Coupled Teachers},
      author={Sheng-Chieh Lin and Jheng-Hong Yang and Jimmy Lin},
      year={2020},
      eprint={2010.11386},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2010.11386},
}
 )r   s    r   citationzDistillKLDivLoss.citation   s    
r   )r   r   r   floatreturnNone)r   zIterable[dict[str, Tensor]]r   r   r4   r   )r   zlist[Tensor]r   r   r4   r   )r4   str)__name__
__module____qualname__r   pairwise_dot_scorer   r    r   propertyr2   __classcell__)r   s   @r   r
   r
      sP    9=9P9Pgjx<(x<_dx<	x<tE
,  r   r
   )
__future__r   collections.abcr   r%   r   r   sentence_transformersr   r   Moduler
   r1   r   r   <module>rA      s&    " $   ;bryy br   