
    rh	                    Z    d Z ddlmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ  G d de      Zy)	a  
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.

Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.

See this script for more details on how to use the new training API:
https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/unsupervised_learning/TSDAE/train_stsb_tsdae.py
    )annotationsN)Dataset)NLTK_IMPORT_ERRORis_nltk_available)InputExamplec                  <    e Zd ZdZd fddZd Zd Zed	d       Zy)
DenoisingAutoEncoderDataseta  
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    c                ,    t         j                  |       S N)r	   delete)ss    /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py<lambda>z$DenoisingAutoEncoderDataset.<lambda>!   s    @[@b@bcd@e     c                    t               s2t        t        j                  | j                  j
                              || _        || _        y r   )r   ImportErrorr   format	__class____name__	sentencesnoise_fn)selfr   r   s      r   __init__z$DenoisingAutoEncoderDataset.__init__!   s7     "/66t~~7N7NOPP" r   c                Z    | j                   |   }t        | j                  |      |g      S )N)texts)r   r   r   )r   itemsents      r   __getitem__z'DenoisingAutoEncoderDataset.__getitem__(   s)    ~~d#4==#6"=>>r   c                ,    t        | j                        S r   )lenr   )r   s    r   __len__z#DenoisingAutoEncoderDataset.__len__,   s    4>>""r   c                N   ddl m} ddlm}  ||       }t	        |      }|dk(  r| S t
        j                  j                  |      |kD  }t        |      dk(  r"d|t
        j                  j                  |      <    |       j                  t        j                  |      |         }|S )Nr   )word_tokenize)TreebankWordDetokenizerT)nltkr#   nltk.tokenize.treebankr$   r    nprandomrandsumchoice
detokenizearray)text	del_ratior#   r$   wordsnkeep_or_notwords_processeds           r   r   z"DenoisingAutoEncoderDataset.delete0   s    &Bd#J6KiinnQ')3{q /3K		((+,13>>rxx{?[\r   N)r   z	list[str])g333333?)	r   
__module____qualname____doc__r   r   r!   staticmethodr    r   r   r	   r	      s0    	 7f !?#  r   r	   )r6   
__future__r   numpyr'   torch.utils.datar   transformers.utils.import_utilsr   r   *sentence_transformers.readers.InputExampler   r	   r8   r   r   <module>r>      s'   	 #  $ P C)' )r   