
    rh"                        d dl mZ d dlZd dlZd dlZd dlmZ 	 d dlmZ d dl
Z
d dlmZ d dlmZ erd dlmZ  ej                   e      Z G d d	e      Zy# e$ r	 d dl	mZ Y Cw xY w)
    )annotationsN)TYPE_CHECKING)Self)AutoTokenizer)InputModule)PreTrainedTokenizerc                       e Zd ZU dZdgZded<   	 	 d	 	 	 	 	 d fdZddZdddd	Ze		 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd
       Z
e		 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd       ZddZddZ	 d	 	 	 	 	 ddZ xZS )SparseStaticEmbeddinga  
    SparseStaticEmbedding module for efficient sparse representations.

    This lightweight module computes sparse representations by mapping input tokens to static weights,
    such as IDF (Inverse Document Frequency) weights. It is designed to encode queries or documents
    into fixed-size embeddings based on the presence of tokens in the input.

    A common scenario is to use this module for encoding queries, and using a heavier module like
    SPLADE (MLMTransformer + SpladePooling) for document encoding.

    Args:
        tokenizer (PreTrainedTokenizer): PreTrainedTokenizer to tokenize input texts into input IDs.
        weight (torch.Tensor | None): Static weights for vocabulary tokens (e.g., IDF weights),
            shape should be (vocab_size,). If None, initializes weights to a vector of ones.
            Default is None.
        frozen (bool): Whether the weights should be frozen (not trainable). Default is False.
    frozenz	list[str]config_keysc                   t         |           || _        |(t        j                  j                  ||       | _        n[t        j                  j                  t        j                  t        | j                  j                                     |       | _        || _
        | j                  j                  d      | _        | j                  j                  | _        y )N)requires_gradr   )super__init__	tokenizertorchnn	Parameterweightoneslen	get_vocabr   sizenum_dimensionsmodel_max_lengthmax_seq_length)selfr   r   r   	__class__s       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/sparse_encoder/models/SparseStaticEmbedding.pyr   zSparseStaticEmbedding.__init__-   s     	"((,,Vv:,NDK((,,UZZDNN<T<T<V8W-Xlrhr,sDK"kk..q1"nn==    c                .   |d   }|d   }|j                  dd       }|j                  d   }t        j                  || j                  |j
                  t        j                        }|j                  d||       || j                  z  }|||z  }||d<   |S )N	input_idsattention_masksentence_embeddingr   )devicedtype   )	getshaper   zerosr   r%   int64scatter_r   )r   featuresr"   r#   r$   
batch_sizetoken_presence
embeddingss           r   forwardzSparseStaticEmbedding.forward?   s    [)	!"23%\\*>E__Q'
 Z1D1DYM]M]ejepepq 	9n= $dkk1
 )#&88J)3%&r    Tsafe_serializationc               n    | j                  |       | j                  ||       | j                  |       y )Nr2   )save_tokenizersave_torch_weightssave_config)r   output_pathr3   argskwargss        r   savezSparseStaticEmbedding.saveU   s2    K(@RS%r    c                j   t         j                  j                  |      s	 ddlm}  ||d||||      }t        |      5 }	t        j                  |	      }
ddd       t        
j                          \  }}|j                  t        |            }t        j                  |t        j                        }t!        |      d	z   }t        j"                  |t        j                        }t        ||      D ]
  \  }}|||<     | d||d
|S # t
        $ r t        d| d      w xY w# 1 sw Y   xY w)a  
        Create an SparseStaticEmbedding module from a JSON file containing token to IDF weight mappings.

        Args:
            json_path (str): Path to the JSON file containing token to IDF weight mappings.
            tokenizer (PreTrainedTokenizer): Tokenizer to use for converting tokens to IDs.
            token (bool | str | None): Token for Hugging Face authentication
            cache_folder (str | None): Cache folder for Hugging Face
            revision (str | None): Model revision
            local_files_only (bool): Whether to only load local files
            **config: Additional configuration options for the IDF model.

        Returns:
            SparseStaticEmbedding: An initialized SparseStaticEmbedding model.
        r   )hf_hub_downloadzidf.json)repo_idfilenametoken	cache_dirrevisionlocal_files_onlyzIDF JSON file not found at z. Please provide a valid path.N)r&   r'   r   r    )ospathexistshuggingface_hubr=   
ValueErroropenjsonloadzipitemsconvert_tokens_to_idslistr   tensorfloat32maxr*   )cls	json_pathr   r@   cache_folderrB   rC   configr=   fInidftokensweights	token_idsmax_token_idr   token_idws                     r   	from_jsonzSparseStaticEmbedding.from_jsonZ   s*   4 ww~~i(j;+%'*%%5	 )_ 	!))C.C	! syy{+33DLA	,,wemm<9~)\?y'2 	!KHa F8	! @&I@@@  j #>ykIg!hiij	! 	!s   D  D)D&)D2c           	     ,   | j                  ||||||      }t        j                  ||||||      }	|j                  dd      }
|
*|
j	                  d      r | j
                  |
|	f||||d|S  | d	d|	d|}| j                  |||||||      }|S )
a  
        Load the SparseStaticEmbedding module with its tokenizer.

        Args:
            model_name_or_path (str): Path to the directory containing the saved model.
            subfolder (str): Subfolder within the model directory
            token (bool | str | None): Token for Hugging Face authentication
            cache_folder (str | None): Cache folder for Hugging Face
            revision (str | None): Model revision
            local_files_only (bool): Whether to only load local files
            **kwargs: Additional keyword arguments

        Returns:
            SparseStaticEmbedding: The loaded SparseStaticEmbedding module.
        )model_name_or_path	subfolderr@   rW   rB   rC   )rd   r@   rA   rB   rC   rG   Nz.json)r@   rA   rB   rC   rD   )rc   rd   r@   rW   rB   rC   modelrE   )load_configr   from_pretrainedpopendswithra   load_torch_weights)rU   rc   rd   r@   rW   rB   rC   r:   rX   r   rG   re   s               r   rM   zSparseStaticEmbedding.load   s    4 1%- ! 
 "11"-
	 zz&$'g 6 3== &!!1   ?49??&&1%- ' 
 r    c                    d| j                   j                  j                   }d| j                          d| j                   | dS )Nz, tokenizer=zSparseStaticEmbedding(z, dim=))r   r   __name__get_config_dictr   )r   tokenizer_infos     r   __repr__zSparseStaticEmbedding.__repr__   sL    '(@(@(I(I'JK'(<(<(>'?vdFYFYEZ[iZjjkllr    c                    | j                   S )N)r   )r   s    r    get_sentence_embedding_dimensionz6SparseStaticEmbedding.get_sentence_embedding_dimension   s    """r    c           	     @    t        | j                  ||ddd            S )NTptF)padding
truncationreturn_tensorsadd_special_tokens)dictr   )r   textsru   s      r   tokenizezSparseStaticEmbedding.tokenize   s)     NN5'dSWlqNr
 	
r    )NF)r   r   r   ztorch.Tensor | Noner   bool)r-   dict[str, torch.Tensor]returnr}   )r8   strr3   r|   r~   None)NNNF)rV   r   r   r   r@   bool | str | NonerW   
str | NonerB   r   rC   r|   ) NNNF)rc   r   rd   r   r@   r   rW   r   rB   r   rC   r|   r~   r   )r~   r   )r~   int)T)rz   z.list[str] | list[dict] | list[tuple[str, str]]ru   z
str | boolr~   r}   )rm   
__module____qualname____doc__r   __annotations__r   r1   r;   classmethodra   rM   rp   rr   r{   __classcell__)r   s   @r   r
   r
      sW   $ 'ZK'
 '+	>&> $> 	>$, HL &
 
 $(#'#!&4A4A '4A !	4A
 !4A 4A 4A 4Al  #'#'#!&DD D !	D
 !D D D 
D DLm# \`
C
NX
	 
r    r
   )
__future__r   rL   loggingrF   typingr   r   ImportErrortyping_extensionsr   transformersr   (sentence_transformers.models.InputModuler   r   	getLoggerrm   loggerr
   rE   r    r   <module>r      sa    "   	  '  & @0			8	$L
K L
  '&'s   A A$#A$