
    rhQF                        d dl mZ d dlZd dlmZmZ d dlmZmZ d dl	m
Z d dlmZ d dlmZ d dlmZmZ  ej&                  e      Z G d	 d
e      Z G d de      Ze G d de             Zy)    )annotationsN)	dataclassfield)CallableUnion)TrainingArguments)ParallelMode)ExplicitEnum)DefaultBatchSamplerMultiDatasetDefaultBatchSamplerc                      e Zd ZdZdZdZdZy)BatchSamplersac  
    Stores the acceptable string identifiers for batch samplers.

    The batch sampler is responsible for determining how samples are grouped into batches during training.
    Valid options are:

    - ``BatchSamplers.BATCH_SAMPLER``: **[default]** Uses :class:`~sentence_transformers.sampler.DefaultBatchSampler`, the default
      PyTorch batch sampler.
    - ``BatchSamplers.NO_DUPLICATES``: Uses :class:`~sentence_transformers.sampler.NoDuplicatesBatchSampler`,
      ensuring no duplicate samples in a batch. Recommended for losses that use in-batch negatives, such as:

        - :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.losses.CachedMultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.losses.MultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.losses.CachedMultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.losses.MegaBatchMarginLoss`
        - :class:`~sentence_transformers.losses.GISTEmbedLoss`
        - :class:`~sentence_transformers.losses.CachedGISTEmbedLoss`
    - ``BatchSamplers.GROUP_BY_LABEL``: Uses :class:`~sentence_transformers.sampler.GroupByLabelBatchSampler`,
      ensuring that each batch has 2+ samples from the same label. Recommended for losses that require multiple
      samples from the same label, such as:

        - :class:`~sentence_transformers.losses.BatchAllTripletLoss`
        - :class:`~sentence_transformers.losses.BatchHardSoftMarginTripletLoss`
        - :class:`~sentence_transformers.losses.BatchHardTripletLoss`
        - :class:`~sentence_transformers.losses.BatchSemiHardTripletLoss`

    If you want to use a custom batch sampler, then you can subclass
    :class:`~sentence_transformers.sampler.DefaultBatchSampler` and pass the class (not an instance) to the
    ``batch_sampler`` argument in :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.).
    Alternatively, you can pass a function that accepts ``dataset``, ``batch_size``, ``drop_last``,
    ``valid_label_columns``, ``generator``, and ``seed`` and returns a
    :class:`~sentence_transformers.sampler.DefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.training_args import BatchSamplers
            from sentence_transformers.losses import MultipleNegativesRankingLoss
            from datasets import Dataset

            model = SentenceTransformer("microsoft/mpnet-base")
            train_dataset = Dataset.from_dict({
                "anchor": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to the office."],
            })
            loss = MultipleNegativesRankingLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                batch_sampler=BatchSamplers.NO_DUPLICATES,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    batch_samplerno_duplicatesgroup_by_labelN)__name__
__module____qualname____doc__BATCH_SAMPLERNO_DUPLICATESGROUP_BY_LABEL     v/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/training_args.pyr   r      s    <| $M#M%Nr   r   c                      e Zd ZdZdZdZy)MultiDatasetBatchSamplersa  
    Stores the acceptable string identifiers for multi-dataset batch samplers.

    The multi-dataset batch sampler is responsible for determining in what order batches are sampled from multiple
    datasets during training. Valid options are:

    - ``MultiDatasetBatchSamplers.ROUND_ROBIN``: Uses :class:`~sentence_transformers.sampler.RoundRobinBatchSampler`,
      which uses round-robin sampling from each dataset until one is exhausted.
      With this strategy, it's likely that not all samples from each dataset are used, but each dataset is sampled
      from equally.
    - ``MultiDatasetBatchSamplers.PROPORTIONAL``: **[default]** Uses :class:`~sentence_transformers.sampler.ProportionalBatchSampler`,
      which samples from each dataset in proportion to its size.
      With this strategy, all samples from each dataset are used and larger datasets are sampled from more frequently.

    If you want to use a custom multi-dataset batch sampler, then you can subclass
    :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler` and pass the class (not an instance) to the
    ``multi_dataset_batch_sampler`` argument in :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`.
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.). Alternatively,
    you can pass a function that accepts ``dataset`` (a :class:`~torch.utils.data.ConcatDataset`), ``batch_samplers``
    (i.e. a list of batch sampler for each of the datasets in the :class:`~torch.utils.data.ConcatDataset`), ``generator``,
    and ``seed`` and returns a :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.training_args import MultiDatasetBatchSamplers
            from sentence_transformers.losses import CoSENTLoss
            from datasets import Dataset, DatasetDict

            model = SentenceTransformer("microsoft/mpnet-base")
            train_general = Dataset.from_dict({
                "sentence_A": ["It's nice weather outside today.", "He drove to work."],
                "sentence_B": ["It's so sunny.", "He took the car to the bank."],
                "score": [0.9, 0.4],
            })
            train_medical = Dataset.from_dict({
                "sentence_A": ["The patient has a fever.", "The doctor prescribed medication.", "The patient is sweating."],
                "sentence_B": ["The patient feels hot.", "The medication was given to the patient.", "The patient is perspiring."],
                "score": [0.8, 0.6, 0.7],
            })
            train_legal = Dataset.from_dict({
                "sentence_A": ["This contract is legally binding.", "The parties agree to the terms and conditions."],
                "sentence_B": ["Both parties acknowledge their obligations.", "By signing this agreement, the parties enter into a legal relationship."],
                "score": [0.7, 0.8],
            })
            train_dataset = DatasetDict({
                "general": train_general,
                "medical": train_medical,
                "legal": train_legal,
            })

            loss = CoSENTLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    round_robinproportionalN)r   r   r   r   ROUND_ROBINPROPORTIONALr   r   r   r   r   T   s    @D  K!Lr   r   c                      e Zd ZU dZg dZ edddi      Zded<    eej                  dd	i      Z
d
ed<    eej                  ddi      Zded<    eeddi      Zded<    eeddi      Zded<    fdZ fdZ xZS )$SentenceTransformerTrainingArgumentsa  
    SentenceTransformerTrainingArguments extends :class:`~transformers.TrainingArguments` with additional arguments
    specific to Sentence Transformers. See :class:`~transformers.TrainingArguments` for the complete list of
    available arguments.

    Args:
        output_dir (`str`):
            The output directory where the model checkpoints will be written.
        prompts (`Union[Dict[str, Dict[str, str]], Dict[str, str], str]`, *optional*):
            The prompts to use for each column in the training, evaluation and test datasets. Four formats are accepted:

            1. `str`: A single prompt to use for all columns in the datasets, regardless of whether the training/evaluation/test
               datasets are :class:`datasets.Dataset` or a :class:`datasets.DatasetDict`.
            2. `Dict[str, str]`: A dictionary mapping column names to prompts, regardless of whether the training/evaluation/test
               datasets are :class:`datasets.Dataset` or a :class:`datasets.DatasetDict`.
            3. `Dict[str, str]`: A dictionary mapping dataset names to prompts. This should only be used if your training/evaluation/test
               datasets are a :class:`datasets.DatasetDict` or a dictionary of :class:`datasets.Dataset`.
            4. `Dict[str, Dict[str, str]]`: A dictionary mapping dataset names to dictionaries mapping column names to
               prompts. This should only be used if your training/evaluation/test datasets are a
               :class:`datasets.DatasetDict` or a dictionary of :class:`datasets.Dataset`.

        batch_sampler (Union[:class:`~sentence_transformers.training_args.BatchSamplers`, `str`, :class:`~sentence_transformers.sampler.DefaultBatchSampler`, Callable[[...], :class:`~sentence_transformers.sampler.DefaultBatchSampler`]], *optional*):
            The batch sampler to use. See :class:`~sentence_transformers.training_args.BatchSamplers` for valid options.
            Defaults to ``BatchSamplers.BATCH_SAMPLER``.
        multi_dataset_batch_sampler (Union[:class:`~sentence_transformers.training_args.MultiDatasetBatchSamplers`, `str`, :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler`, Callable[[...], :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler`]], *optional*):
            The multi-dataset batch sampler to use. See :class:`~sentence_transformers.training_args.MultiDatasetBatchSamplers`
            for valid options. Defaults to ``MultiDatasetBatchSamplers.PROPORTIONAL``.
        router_mapping (`Dict[str, str] | Dict[str, Dict[str, str]]`, *optional*):
            A mapping of dataset column names to Router routes, like "query" or "document". This is used to specify
            which Router submodule to use for each dataset. Two formats are accepted:

            1. `Dict[str, str]`: A mapping of column names to routes.
            2. `Dict[str, Dict[str, str]]`: A mapping of dataset names to a mapping of column names to routes for
               multi-dataset training/evaluation.
        learning_rate_mapping (`Dict[str, float] | None`, *optional*):
            A mapping of parameter name regular expressions to learning rates. This allows you to set different
            learning rates for different parts of the model, e.g., `{'SparseStaticEmbedding\.*': 1e-3}` for the
            SparseStaticEmbedding module. This is useful when you want to fine-tune specific parts of the model
            with different learning rates.
    )accelerator_configfsdp_config	deepspeedgradient_checkpointing_kwargslr_scheduler_kwargspromptsrouter_mappinglearning_rate_mappingNhelpzThe prompts to use for each column in the datasets. Either 1) a single string prompt, 2) a mapping of column names to prompts, 3) a mapping of dataset names to prompts, or 4) a mapping of dataset names to a mapping of column names to prompts.)defaultmetadataz;Union[str, None, dict[str, str], dict[str, dict[str, str]]]r)   zThe batch sampler to use.zRUnion[BatchSamplers, str, DefaultBatchSampler, Callable[..., DefaultBatchSampler]]r   z'The multi-dataset batch sampler to use.zvUnion[MultiDatasetBatchSamplers, str, MultiDatasetDefaultBatchSampler, Callable[..., MultiDatasetDefaultBatchSampler]]multi_dataset_batch_samplerzA mapping of dataset column names to Router routes, like "query" or "document". Either 1) a mapping of column names to routes or 2) a mapping of dataset names to a mapping of column names to routes for multi-dataset training/evaluation. )default_factoryr.   r*   zA mapping of parameter name regular expressions to learning rates. This allows you to set different learning rates for different parts of the model, e.g., {'SparseStaticEmbedding\.*': 1e-3} for the SparseStaticEmbedding module.z"Union[str, None, dict[str, float]]r+   c                   t         |           t        | j                  t              rt        | j                        n| j                  | _        t        | j                  t              rt        | j                        n| j                  | _        | j                  | j                  ni | _        t        | j                  t              rt        d      | j                  | j                  ni | _
        t        | j                  t              rt        d      d| _        d| _        | j                  t        j                  k(  r&| j                   dk7  rt"        j%                  d       y y | j                  t        j&                  k(  r9| j(                  s,| j                   dk7  rt"        j%                  d       d| _        y y y )NzThe `router_mapping` argument must be a dictionary mapping dataset column names to Router routes, like 'query' or 'document'. A stringified dictionary also works.zThe `learning_rate_mapping` argument must be a dictionary mapping parameter name regular expressions to learning rates. A stringified dictionary also works.TFunusedzCurrently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.zWhen using DistributedDataParallel (DDP), it is recommended to set `dataloader_drop_last=True` to avoid hanging issues with an uneven last batch. Setting `dataloader_drop_last=True`.)super__post_init__
isinstancer   strr   r/   r   r*   
ValueErrorr+   prediction_loss_onlyddp_broadcast_buffersparallel_moder	   NOT_DISTRIBUTED
output_dirloggerwarningDISTRIBUTEDdataloader_drop_last)self	__class__s    r   r4   z2SentenceTransformerTrainingArguments.__post_init__   s    2<D<N<NPS1TM$,,-Z^ZlZl 	
 $::C@ &d&F&FG11 	( 6:5H5H5Td11Z\d))3/ S 
 DHC]C]CiT%?%?oq"d00#6 J  %)! &+"!=!== (*v + <#;#;;DD]D] (*; )-D% E^;r   c                h    t         |          }t        |d         r|d= t        |d         r|d= |S )Nr   r/   )r3   to_dictcallable)rA   training_args_dictrB   s     r   rD   z,SentenceTransformerTrainingArguments.to_dict/  sD    "W_.&78"?3&'DEF"#@A!!r   )r   r   r   r   _VALID_DICT_FIELDSr   r)   __annotations__r   r   r   r   r!   r/   dictr*   r+   r4   rD   __classcell__)rB   s   @r   r#   r#      s    'Z	 LQ d
LGH  in++v?Z6[iMe 
 	)66&JsAt	   " 
 SX P
SNO  AF X
A= 7-r" "r   r#   )
__future__r   loggingdataclassesr   r   typingr   r   transformersr   TransformersTrainingArgumentstransformers.training_argsr	   transformers.utilsr
   sentence_transformers.samplerr   r   	getLoggerr   r=   r   r   r#   r   r   r   <module>rU      sl    "  ( " K 3 + ^			8	$A&L A&HD" D"N Y"+H Y" Y"r   