from __future__ import annotations

import inspect
import logging
import math
import os
from pathlib import Path
from typing import Any

try:
    from typing import Self
except ImportError:
    from typing_extensions import Self

import numpy as np
import torch
from safetensors.torch import save_file as save_safetensors_file
from tokenizers import Tokenizer
from torch import nn
from transformers import PreTrainedTokenizerFast

from sentence_transformers.models.InputModule import InputModule
from sentence_transformers.util import get_device_name

logger = logging.getLogger(__name__)


class StaticEmbedding(InputModule):
    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.ndarray | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        .. tip::

            Due to the extremely efficient nature of this module architecture, the overhead for moving inputs to the
            GPU can be larger than the actual computation time. Therefore, consider using a CPU device for inference
            and training.
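
            For example, ``SentenceTransformer(modules=[static_embedding], device="cpu")`` keeps both the
            module and its inputs on the CPU.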

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda, also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.8093]]) (If you use potion-base-8M)
            # tensor([[0.6234]]) (If you use the distillation method)
            # tensor([[-0.0693]]) (For example, if you use randomized embeddings)
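
            # A sketch of saving and reloading; "static-embedding-model" is a hypothetical local path:
            model.save_pretrained("static-embedding-model")
            model = SentenceTransformer("static-embedding-model")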

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()
        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)
            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer.no_padding()

        self.base_model = kwargs.get("base_model", None)

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

        # nn.EmbeddingBag consumes one flat tensor of token ids plus the offset at which each text starts
        offsets = torch.from_numpy(np.cumsum([0] + [len(token_ids) for token_ids in encodings_ids[:-1]]))
        input_ids = torch.tensor([token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long)
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    @property
    def max_seq_length(self) -> int:
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        return self.embedding_dim

    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(output_path, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(output_path, "pytorch_model.bin"))

        self.tokenizer.save(str(Path(output_path) / "tokenizer.json"))

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ) -> Self:
        hub_kwargs = {
            "subfolder": subfolder,
            "token": token,
            "cache_folder": cache_folder,
            "revision": revision,
            "local_files_only": local_files_only,
        }
        tokenizer_path = cls.load_file_path(model_name_or_path, filename="tokenizer.json", **hub_kwargs)
        tokenizer = Tokenizer.from_file(tokenizer_path)

        weights = cls.load_torch_weights(model_name_or_path=model_name_or_path, **hub_kwargs)
        try:
            weights = weights["embedding.weight"]
        except KeyError:
            # Fall back to the key used by exported model2vec checkpoints
            weights = weights["embeddings"]

        return cls(tokenizer, embedding_weights=weights)

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        sif_coefficient: float | None = 1e-4,
        token_remove_pattern: str | None = r"\[unused\d+\]",
        quantize_to: str = "float32",
        use_subword: bool = True,
        **kwargs: Any,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str | None, optional): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            sif_coefficient (float | None, optional): The coefficient for SIF weighting. Defaults to 1e-4.
            token_remove_pattern (str | None, optional): A regex pattern to remove tokens from the vocabulary.
                Defaults to r"\[unused\d+\]".
            quantize_to (str): The data type to quantize the weights to. Defaults to 'float32'.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.
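
        Example (a minimal sketch; the model name mirrors the class docstring above)::

            from sentence_transformers.models import StaticEmbedding

            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cpu")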

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec.distill import distill
        except ImportError:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            )

        if device is None:
            device = get_device_name()

        # Only forward the arguments that this version of `model2vec` actually supports
        distill_signature = inspect.signature(distill)
        distill_kwargs = set(distill_signature.parameters.keys()) - {"model_name"}
        kwargs = {
            "vocabulary": vocabulary,
            "device": device,
            "pca_dims": pca_dims,
            "apply_zipf": apply_zipf,
            "use_subword": use_subword,
            "quantize_to": quantize_to,
            "sif_coefficient": sif_coefficient,
            "token_remove_pattern": token_remove_pattern,
        }
        if leftovers := set(kwargs.keys()) - distill_kwargs:
            logger.warning(
                f"Your version of `model2vec` does not support the {', '.join(map(repr, leftovers))} "
                "arguments for the `distill` method. Consider updating `model2vec` to take advantage of these arguments."
            )
            kwargs = {key: value for key, value in kwargs.items() if key in distill_kwargs}

        static_model = distill(model_name, **kwargs)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                from the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec import StaticModel
        except ImportError:
            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")

        static_model = StaticModel.from_pretrained(model_id_or_path)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights, base_model=model_id_or_path)