
    rhL                        d dl mZ d dlZd dlZd dlmZ 	 d dlmZ d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ  ej,                  e      Z G d
 de      ZeZy# e$ r	 d dl	mZ Y Ow xY w)    )annotationsN)Path)Self)Tensornn)logging)InputModule)Module)import_from_stringload_dir_pathc                       e Zd ZU dhZddgZded<   dZ	 d	 	 	 	 	 	 	 d fdZe	 	 d	 	 	 	 	 	 	 	 	 dd       Z	ddd	Z
dd
ZdddZdddZe	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zed        Zedd       Zej$                  dd       Z xZS )Routertaskdefault_routeallow_empty_keyz	list[str]config_keyszrouter_config.jsonc           
        t         |           |t        |      dk(  rt        d      |,||vr(t        d| dt	        |j                                      t        j                  |j                         D ci c]  \  }}|t        j                  |  c}}      | _
        |r$|"t        t        |j                                     }|| _        || _        yc c}}w )a  
        This model allows to create asymmetric SentenceTransformer models that apply different modules depending on the specified route,
        such as "query" or "document". Especially useful for models that have different encoders for queries and documents.

        Notably, the ``task`` argument of ``model.encode`` can be used to specify which route to use, and
        ``model.encode_query`` and ``model.encode_document`` are shorthands for using ``task="query"`` and
        ``task="document"``, respectively. These methods also optionally apply ``prompts`` specific to queries
        or documents.

        .. note::

            When training models with the :class:`~sentence_transformers.models.Router` module, you must use the
            ``router_mapping`` argument in the :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`
            or :class:`~sentence_transformers.sparse_encoder.training_args.SparseEncoderTrainingArguments` to map the
            training dataset columns to the correct route ("query" or "document"). For example, if your training dataset(s)
            have ``["question", "positive", "negative"]`` columns, then you can use the following mapping::

                args = SparseEncoderTrainingArguments(
                    ...,
                    router_mapping={
                        "question": "query",
                        "positive": "document",
                        "negative": "document",
                    }
                )

            Additionally, it is common to use a different learning rate for the different routes. For this, you should
            use the ``learning_rate_mapping`` argument in the :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`
            or :class:`~sentence_transformers.sparse_encoder.training_args.SparseEncoderTrainingArguments` to map parameter patterns
            to their learning rates. For example, if you want to use a learning rate of ``1e-3`` for an SparseStaticEmbedding module and
            ``2e-5`` for the rest of the model, you can do this::

                args = SparseEncoderTrainingArguments(
                    ...,
                    learning_rate=2e-5,
                    learning_rate_mapping={
                        r"SparseStaticEmbedding\.*": 1e-3,
                    }
                )

        In the below examples, the ``Router`` model is used to create asymmetric models with different encoders for
        queries and documents. In these examples, the "query" route is efficient (e.g., using SparseStaticEmbedding),
        while the "document" route uses a more complex model (e.g. a Transformers module). This allows for efficient
        query encoding while still using a powerful document encoder, but the combinations are not limited to this.

        Example:
            ::

                from sentence_transformers import SentenceTransformer
                from sentence_transformers.models import Router, Normalize

                # Use a regular SentenceTransformer for the document embeddings, and a static embedding model for the query embeddings
                document_embedder = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
                query_embedder = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1")
                router = Router.for_query_document(
                    query_modules=list(query_embedder.children()),
                    document_modules=list(document_embedder.children()),
                )
                normalize = Normalize()

                # Create an asymmetric model with different encoders for queries and documents
                model = SentenceTransformer(
                    modules=[router, normalize],
                )

                # ... requires more training to align the vector spaces

                # Use the query & document routes
                query_embedding = model.encode_query("What is the capital of France?")
                document_embedding = model.encode_document("Paris is the capital of France.")

            ::

                from sentence_transformers.models import Router
                from sentence_transformers.sparse_encoder import SparseEncoder
                from sentence_transformers.sparse_encoder.models import MLMTransformer, SparseStaticEmbedding, SpladePooling

                # Load an asymmetric model with different encoders for queries and documents
                doc_encoder = MLMTransformer("opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill")
                router = Router.for_query_document(
                    query_modules=[
                        SparseStaticEmbedding.from_json(
                            "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
                            tokenizer=doc_encoder.tokenizer,
                            frozen=True,
                        ),
                    ],
                    document_modules=[
                        doc_encoder,
                        SpladePooling(pooling_strategy="max", activation_function="log1p_relu"),
                    ],
                )

                model = SparseEncoder(modules=[router], similarity_fn_name="dot")

                query = "What's the weather in ny now?"
                document = "Currently New York is rainy."

                query_embed = model.encode_query(query)
                document_embed = model.encode_document(document)

                sim = model.similarity(query_embed, document_embed)
                print(f"Similarity: {sim}")

                # Visualize top tokens for each text
                top_k = 10
                print(f"Top tokens {top_k} for each text:")

                decoded_query = model.decode(query_embed, top_k=top_k)
                decoded_document = model.decode(document_embed)

                for i in range(min(top_k, len(decoded_query))):
                    query_token, query_score = decoded_query[i]
                    doc_score = next((score for token, score in decoded_document if token == query_token), 0)
                    if doc_score != 0:
                        print(f"Token: {query_token}, Query score: {query_score:.4f}, Document score: {doc_score:.4f}")

                '''
                Similarity: tensor([[11.1105]], device='cuda:0')
                Top tokens 10 for each text:
                Token: ny, Query score: 5.7729, Document score: 0.8049
                Token: weather, Query score: 4.5684, Document score: 0.9710
                Token: now, Query score: 3.5895, Document score: 0.4720
                Token: ?, Query score: 3.3313, Document score: 0.0286
                Token: what, Query score: 2.7699, Document score: 0.0787
                Token: in, Query score: 0.4989, Document score: 0.0417
                '''

        Note:
            These models are not necessarily stronger than non-asymmetric models. Rudimentary experiments indicate
            that non-Router models perform better in many cases.

        Args:
            sub_modules: Mapping of route keys to lists of modules. Each key corresponds to a specific task type,
                often "query" or "document", and the list contains the modules to be applied for that task type.
            default_route: The default route to use if no task type is specified. If None, an exception will be thrown
                if no task type is specified. If ``allow_empty_key`` is True, the first key in sub_modules will be used as
                the default route. Defaults to None.
            allow_empty_key: If True, allows the default route to be set to the first key in `sub_modules` if
                ``default_route`` is None. Defaults to True.
        Nr   z&The routes dictionary cannot be empty.zDefault route 'z' not found in route keys: )super__init__len
ValueErrorlistkeysr   
ModuleDictitems
Sequentialsub_modulesnextiterr   r   )selfr   r   r   
route_namemodules	__class__s         v/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/models/Router.pyr   zRouter.__init__   s    ` 	#k"2a"7EFF$k)I}o=XY]^i^n^n^pYqXrstt==LWL]L]L_`5HZZ00`

 }4 k&6&6&8!9:M*. as   ;C
c                &     | ||d|xs d|      S )a  
        Creates a Router model specifically for query and document modules, allowing convenient usage via `model.encode_query`
        and `model.encode_document`.

        Args:
            query_modules: List of modules to be applied for the "query" task type.
            document_modules: List of modules to be applied for the "document" task type.
            default_route: The default route to use if no task type is specified. If None, an exception will be thrown
                if no task type is specified. If ``allow_empty_key`` is True, the first key in sub_modules will be used as
                the default route. Defaults to None.
            allow_empty_key: If True, allows the default route to be set to the first key in `sub_modules` if
                ``default_route`` is None. Defaults to True.

        Returns:
            Router: An instance of the Router model with the specified query and document modules.
        )querydocumentr'   )r   r   r    )clsquery_modulesdocument_modulesr   r   s        r$   for_query_documentzRouter.for_query_document   s%    0 "/=MN'5:+
 	
    c           	        ||j                  d| j                        }|"| j                  rt        d      t        d      || j                  vr2t        d| dt        | j                  j                                      ||d<   | j                  |   D ]H  }|j                         D ci c]"  \  }}t        |d      r||j                  v r||$ }}} ||fi |}J |S c c}}w )Nr   vYou must provide a `router_mapping` argument on the training arguments, or set a default route in the `Router` module.kYou must provide a `task` argument when calling this method, or set a default route in the `Router` module.No route found for task type ''. Available routes: forward_kwargs)
getr   trainingr   r   r   r   r   hasattrr3   )r    featuresr   kwargsmodulekeyvaluemodule_kwargss           r$   forwardzRouter.forward   s   <<<(:(:;D<}} E 
 !E 
 t'''06KDQUQaQaQfQfQhLiKjk  v&&t, 	9F #),,.C6#34@U@U9U U
M 
 h8-8H	9 s   -'C$c                    | j                   j                         D ]2  }t        |      D ]"  }t        |d      s|j	                         c c S  4 y )N get_sentence_embedding_dimension)r   valuesreversedr6   r?   )r    r   r9   s      r$   r?   z'Router.get_sentence_embedding_dimension   sW    ++224 	EK";/ E6#EF!BBDDE	E r-   c           	     V   i }i }i }| j                   j                         D ]  \  }}g ||<   t        |      D ]k  \  }	}
| d|	 dt        |
      j                   }|
||<   t        |
      j
                   dt        |
      j                   ||<   ||   j                  |       m  |j                         D ][  \  }}
t        j                  j                  |t        |            }t        j                  |d       	  |
j                  |fd|i| ] t        t        j                  j                  || j                        dd      5 }t!        j"                  ||| j%                         d	|d
       d d d        y # t        $ r |
j                  |       Y w xY w# 1 sw Y   y xY w)N_.T)exist_oksafe_serializationwutf8)encoding)types	structure
parameters   )indent)r   r   	enumeratetype__name__
__module__appendospathjoinstrmakedirssave	TypeErroropenconfig_file_namejsondumpget_config_dict)r    output_pathrF   r8   model_lookupmodel_typesmodel_structurenamemodels
module_idxmodelmodel_id
model_pathfOuts                 r$   rY   zRouter.save   s    ,,224 	7LD&$&OD!%.v%6 7!
E"V1ZL$u+2F2F1GH).X&+/;+A+A*B!DKDXDXCY(ZH%%,,X6	7	7  ,113 	'OHek3x=AJKK
T2'

:W:LWPVW		' "'',,{D,A,ABCRXY 		]aII(!0"&"6"6"8
 		 			  '

:&'		 		s   >E?+F?FFF(c           	        t        |d   t              rO|;t        d |D              }t        |      dkD  rt	        d      |j                         }|D cg c]  }||   	 }}|| j                  }|"| j                  rt	        d      t	        d      || j                  vr2t	        d| dt        | j                  j                                      | j                  |   d   } |j                  |fi |}||d	<   |S c c}w )
z-Tokenizes a text and maps tokens to token-idsr   c              3  J   K   | ]  }|j                         D ]  }|   y wN)r   ).0textr:   s      r$   	<genexpr>z"Router.tokenize.<locals>.<genexpr>$  s"     IDTYY[IcCICIs   !#   zYou cannot pass a list of dictionaries with different task types. Please ensure all dictionaries have the same task type key, or pass a single `task` argument.r/   r0   r1   r2   r   )
isinstancedictsetr   r   popr   r5   r   r   r   tokenize)r    textsr   r8   tasksro   input_module	tokenizeds           r$   rv   zRouter.tokenize  s4   eAh%|IEIIu:>$x  yy{ -22DT$Z2E2<%%D<}} E 
 !E  t'''06KDQUQaQaQfQfQhLiKjk  ''-a0)L))%:6:	 	&/ 3s   Dc           
     L   ||||d} | j                   d	||d|}	|	s | j                   d	|d|d|}	i }
|	d   j                         D ]E  \  }}t        |      }	  |j                  |fdt	        ||      j                         i||}||
|<   G i }|	d   j                         D ](  \  }}g ||<   |D ]  }||   j                  |
|           *  | |fi |	d   }|S # t        $ r: t        d	|t	        ||      j                         d|}|j                  |      }Y w xY w)
N)tokencache_folderrevisionlocal_files_only)model_name_or_path	subfolderzconfig.json)r   config_filenamer   rJ   r   rK   rL   r(   )	load_configr   r   loadr   as_posixrZ   r   rS   )r)   r   r   r|   r}   r~   r   r8   
hub_kwargsconfigr"   rh   
model_typemodule_classr9   
local_pathrc   key_namemodels_listrg   s                       r$   r   zRouter.loadF  s    (  0	

 !j4FR[j_ij$S__ #5}`imwF $*7O$9$9$; 	' Hj#5j#AL7***&26y(2K2T2T2VZdhn !'GH	' %+K%8%>%>%@ 	D!Hk(*OH%' D)001BCD	D
 O<vl';<  7* '9T)U]E^EgEgEimw
 &**:6	7s   /C  A D#"D#c                    | j                   j                         D ].  }|d   }t        |d      s|j                  "|j                  c S  y )Nr   	tokenizer)r   r@   r6   r   )r    r   ry   s      r$   r   zRouter.tokenizert  sQ      ++224 	.K(3AL|[1l6L6L6X#---	. r-   c                >   t               }| j                  j                         D ]2  }|d   }|st        |d      s|j	                  |j
                         4 |sy t        |      dk(  r|j                         S t        j                  d| d       t        |      S )Nr   max_seq_lengthrq   z$Different max_seq_lengths detected: z. Using the maximum value.)rt   r   r@   r6   addr   r   ru   loggerwarning_oncemax)r    max_seq_lengthsr"   ry   s       r$   r   zRouter.max_seq_length}  s     %''..0 	AG(/
L7<1AB##L$?$?@	A
 !Q&"&&(("FFWWq rs''r-   c                   g }| j                   j                         D ])  \  }}|s	t        |d   d      s|j                  |       + t	        |      dk(  rt
        j                  d       y |D ]  }| j                   |   d   }||_         y )Nr   r   z2No modules have a max_seq_length attribute to set.)r   r   r6   rS   r   r   warningr   )r    r;   has_max_seq_length_keysr:   re   ry   s         r$   r   zRouter.max_seq_length  s     #%++113 	4KC'&)-=>'..s3	4 &'1,NNOP* 	0C(,(8(8(=a(@L*/L'	0r-   )NT)r   zdict[str, list[Module]]r   
str | Noner   boolreturnNone)
r*   list[Module]r+   r   r   r   r   r   r   r   rm   )r7   dict[str, Tensor]r   r   r   r   )r   int)T)r`   rW   rF   r   )rw   z!list[str] | list[tuple[str, str]]r   r   ) NNNF)r   rW   r   rW   r|   zbool | str | Noner}   r   r~   r   r   r   r   r   )r   r   )rQ   rR   __qualname__r3   r   __annotations__r\   r   classmethodr,   r=   r?   rY   rv   r   propertyr   r   setter__classcell__)r#   s   @r$   r   r      s`   XN-/@AKA+ os^/2^/CM^/gk^/	^/@ 
 %) $
#
 '
 "	

 
 

 
:<B%N  #'#'#!&++ + !	+
 !+ + + 
+ +Z   ( (" 0 0r-   r   )
__future__r   r]   rT   pathlibr   typingr   ImportErrortyping_extensionstorchr   r   transformers.utilsr   (sentence_transformers.models.InputModuler	   #sentence_transformers.models.Moduler
   sentence_transformers.utilr   r   
get_loggerrQ   r   r   Asymr(   r-   r$   <module>r      sg    "  	 '  & @ 6 H			H	%G0[ G0V q  '&'s   A A,+A,