
from __future__ import annotations

import logging
import time
from typing import TYPE_CHECKING, Any

import numpy as np
import torch
from tqdm import tqdm

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    try:
        from elasticsearch import Elasticsearch
    except ImportError:
        pass
    try:
        from qdrant_client import QdrantClient
    except ImportError:
        pass
    try:
        from seismic import SeismicIndex
    except ImportError:
        pass
    try:
        from opensearchpy import OpenSearch
    except ImportError:
        pass


def semantic_search_qdrant(
    query_embeddings: torch.Tensor,
    corpus_embeddings: torch.Tensor | None = None,
    corpus_index: tuple[QdrantClient, str] | None = None,
    top_k: int = 10,
    output_index: bool = False,
    **kwargs: Any,
) -> (
    tuple[list[list[dict[str, int | float]]], float]
    | tuple[list[list[dict[str, int | float]]], float, tuple[QdrantClient, str]]
):
    """
    Performs semantic search using sparse embeddings with Qdrant.

    Args:
        query_embeddings: PyTorch COO sparse tensor containing query embeddings
        corpus_embeddings: PyTorch COO sparse tensor containing corpus embeddings
            Only used if corpus_index is None
        corpus_index: Tuple of (QdrantClient, collection_name)
            If provided, uses this existing index for search
        top_k: Number of top results to retrieve
        output_index: Whether to return the Qdrant client and collection name

    Returns:
        A tuple containing:
        - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
        - Time taken for search
        - (Optional) Tuple of (QdrantClient, collection_name) if output_index is True
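
    Example:
        A minimal sketch, assuming a Qdrant server is reachable at http://localhost:6333;
        'my-sparse-model' is a placeholder name and the embeddings are sparse COO tensors
        as described above::

            model = SparseEncoder('my-sparse-model')
            query_embeddings = model.encode(["your query text"])
            corpus_embeddings = model.encode(["first document", "second document"])

            results, search_time = semantic_search_qdrant(
                query_embeddings, corpus_embeddings=corpus_embeddings, top_k=2
            )
            # results[0] is a list like [{"corpus_id": ..., "score": ...}, ...]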
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.http import models
    except ImportError:
        raise ImportError("Please install the Qdrant client with `pip install qdrant-client` to use this function.")

    if not query_embeddings.is_sparse or query_embeddings.layout != torch.sparse_coo:
        raise ValueError("Query embeddings must be a sparse COO tensor")

    if corpus_index is None:
        if corpus_embeddings is None:
            raise ValueError("Either corpus_embeddings or corpus_index must be provided")
        if not corpus_embeddings.is_sparse or corpus_embeddings.layout != torch.sparse_coo:
            raise ValueError("Corpus embeddings must be a sparse COO tensor")

        # Connect to a (by default local) Qdrant instance and create a collection that only
        # holds a named sparse vector called "text".
        client = QdrantClient(url="http://localhost:6333", **kwargs)
        collection_name = f"sparse_collection_{int(time.time())}"
        client.create_collection(
            collection_name=collection_name,
            vectors_config={},
            sparse_vectors_config={"text": models.SparseVectorParams(index=models.SparseIndexParams(on_disk=False))},
        )

        corpus = corpus_embeddings.coalesce()
        indices_arr = corpus.indices().cpu().numpy()
        values_arr = corpus.values().cpu().numpy()
        num_vectors = corpus.size(0)

        # For every corpus row, locate its slice in the flattened COO indices/values arrays.
        row_ids = indices_arr[0]
        starts = np.searchsorted(row_ids, np.arange(num_vectors), side="left")
        ends = np.searchsorted(row_ids, np.arange(num_vectors), side="right")

        batch_size = 10000
        vectors_batch = []
        insert_idx = 0
        for i in tqdm(range(num_vectors), desc="Processing and Upserting embeddings"):
            start, end = starts[i], ends[i]
            vec_indices = indices_arr[1][start:end].tolist()
            vec_values = values_arr[start:end].tolist()
            vectors_batch.append({"text": models.SparseVector(indices=vec_indices, values=vec_values)})

            if len(vectors_batch) >= batch_size or i == num_vectors - 1:
                client.upload_collection(
                    collection_name=collection_name,
                    vectors=vectors_batch,
                    ids=range(insert_idx, insert_idx + len(vectors_batch)),
                )
                insert_idx += len(vectors_batch)
                vectors_batch = []

        corpus_index = (client, collection_name)

    client, collection_name = corpus_index

    queries = query_embeddings.coalesce()
    num_queries = 1 if queries.sparse_dim() == 1 else queries.size(0)

    all_results = []
    search_start_time = time.time()
    for q_idx in range(num_queries):
        if queries.sparse_dim() == 1:
            q_indices = queries.indices()[0].cpu().numpy().tolist()
            q_values = queries.values().cpu().numpy().tolist()
        else:
            row_mask = queries.indices()[0].cpu().numpy() == q_idx
            q_indices = queries.indices()[1].cpu().numpy()[row_mask].tolist()
            q_values = queries.values().cpu().numpy()[row_mask].tolist()

        search_results = client.query_points(
            collection_name=collection_name,
            query=models.SparseVector(indices=q_indices, values=q_values),
            limit=top_k,
            using="text",
        ).points

        all_results.append([{"corpus_id": hit.id, "score": hit.score} for hit in search_results])

    search_time = time.time() - search_start_time

    if output_index:
        return all_results, search_time, corpus_index
    return all_results, search_time


def semantic_search_elasticsearch(
    query_embeddings_decoded: list[list[tuple[str, float]]],
    corpus_embeddings_decoded: list[list[tuple[str, float]]] | None = None,
    corpus_index: tuple[Elasticsearch, str] | None = None,
    top_k: int = 10,
    output_index: bool = False,
    **kwargs: Any,
) -> (
    tuple[list[list[dict[str, int | float]]], float]
    | tuple[list[list[dict[str, int | float]]], float, tuple[Elasticsearch, str]]
):
    """
    Performs semantic search using sparse embeddings with Elasticsearch.

    Args:
        query_embeddings_decoded: List of query embeddings in format [[("token", value), ...], ...]
            Example: To get this format from a SparseEncoder model::

                model = SparseEncoder('my-sparse-model')
                query_texts = ["your query text"]
                query_embeddings = model.encode(query_texts)
                query_embeddings_decoded = model.decode(query_embeddings)
        corpus_embeddings_decoded: List of corpus embeddings in format [[("token", value), ...], ...]
            Only used if corpus_index is None
            Can be obtained using the same decode method as query embeddings
        corpus_index: Tuple of (Elasticsearch, collection_name)
            If provided, uses this existing index for search
        top_k: Number of top results to retrieve
        output_index: Whether to return the Elasticsearch client and collection name

    Returns:
        A tuple containing:
        - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
        - Time taken for search
        - (Optional) Tuple of (Elasticsearch, collection_name) if output_index is True
    """
    try:
        from elasticsearch import Elasticsearch, helpers
    except ImportError:
        raise ImportError(
            "Please install the Elasticsearch client with `pip install elasticsearch` to use this function."
        )

    if not isinstance(query_embeddings_decoded, list) or not all(
        isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
        for item in query_embeddings_decoded
    ):
        raise ValueError("Query embeddings must be a list of lists in the format [[('token', value), ...], ...]")

    if corpus_index is None:
        if corpus_embeddings_decoded is None:
            raise ValueError("Either corpus_embeddings_decoded or corpus_index must be provided")
        if not isinstance(corpus_embeddings_decoded, list) or not all(
            isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
            for item in corpus_embeddings_decoded
        ):
            raise ValueError("Corpus embeddings must be a list of lists in the format [[('token', value), ...], ...]")

        es = Elasticsearch("http://localhost:9200", **kwargs)
        index_name = f"sparse_index_{int(time.time())}"

        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)

        # Token weights are stored in a `rank_features` field so that they can be scored
        # with `rank_feature` queries at search time.
        es.indices.create(
            index=index_name,
            body={
                "mappings": {
                    "properties": {
                        "tokens": {"type": "rank_features"},
                        "id": {"type": "keyword"},
                    }
                }
            },
        )

        num_docs = len(corpus_embeddings_decoded)
        batch_size = 10000
        for start_idx in tqdm(range(0, num_docs, batch_size), desc="Upserting embeddings"):
            end_idx = min(start_idx + batch_size, num_docs)
            actions = []
            for i in range(start_idx, end_idx):
                tokens = dict(corpus_embeddings_decoded[i])
                # Field names may not contain dots, so sanitize the token keys.
                tokens = {str(k).replace(".", "_"): v for k, v in tokens.items()}
                actions.append(
                    {
                        "_index": index_name,
                        "_id": str(i),
                        "_source": {"id": str(i), "tokens": tokens},
                    }
                )
            helpers.bulk(es, actions)
        es.indices.refresh(index=index_name)
        corpus_index = (es, index_name)

    es, index_name = corpus_index

    all_results = []
    search_start_time = time.time()
    for q_idx in range(len(query_embeddings_decoded)):
        query_tokens = dict(query_embeddings_decoded[q_idx])
        query_tokens = {str(k).replace(".", "_"): v for k, v in query_tokens.items()}

        # One rank_feature clause per query token, weighted by the token's value.
        should_clauses = []
        for token, weight in query_tokens.items():
            should_clauses.append({"rank_feature": {"field": f"tokens.{token}", "saturation": {}, "boost": weight}})

        query = {
            "size": top_k,
            "query": {"bool": {"should": should_clauses, "minimum_should_match": 1}},
        }
        result = es.search(index=index_name, body=query)
        formatted = [{"corpus_id": int(hit["_id"]), "score": hit["_score"]} for hit in result["hits"]["hits"]]
        all_results.append(formatted)

    search_time = time.time() - search_start_time

    if output_index:
        return all_results, search_time, corpus_index
    return all_results, search_time


def semantic_search_seismic(
    query_embeddings_decoded: list[list[tuple[str, float]]],
    corpus_embeddings_decoded: list[list[tuple[str, float]]] | None = None,
    corpus_index: tuple[SeismicIndex, str] | None = None,
    top_k: int = 10,
    output_index: bool = False,
    index_kwargs: dict[str, Any] | None = None,
    search_kwargs: dict[str, Any] | None = None,
) -> (
    tuple[list[list[dict[str, int | float]]], float]
    | tuple[list[list[dict[str, int | float]]], float, tuple[SeismicIndex, str]]
):
    """
    Performs semantic search using sparse embeddings with Seismic.

    Args:
        query_embeddings_decoded: List of query embeddings in format [[("token", value), ...], ...]
            Example: To get this format from a SparseEncoder model::

                model = SparseEncoder('my-sparse-model')
                query_texts = ["your query text"]
                query_embeddings = model.encode(query_texts)
                query_embeddings_decoded = model.decode(query_embeddings)
        corpus_embeddings_decoded: List of corpus embeddings in format [[("token", value), ...], ...]
            Only used if corpus_index is None
            Can be obtained using the same decode method as query embeddings
        corpus_index: Tuple of (SeismicIndex, collection_name)
            If provided, uses this existing index for search
        top_k: Number of top results to retrieve
        output_index: Whether to return the SeismicIndex client and collection name
        index_kwargs: Additional arguments for SeismicIndex passed to build_from_dataset,
            such as centroid_fraction, min_cluster_size, summary_energy, nknn, knn_path,
            batched_indexing, or num_threads.
        search_kwargs: Additional arguments for SeismicIndex passed to batch_search,
            such as query_cut, heap_factor, n_knn, sorted, or num_threads.
            Note: query_cut and heap_factor are set to default values if not provided.
    Returns:
        A tuple containing:
        - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
        - Time taken for search
        - (Optional) Tuple of (SeismicIndex, collection_name) if output_index is True
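
    Example:
        A minimal sketch, assuming pyseismic-lsr is installed, all index and search tuning left
        at its defaults, and reusing the placeholder model name from the Args section above::

            model = SparseEncoder('my-sparse-model')
            query_embeddings_decoded = model.decode(model.encode(["your query text"]))
            corpus_texts = ["first document", "second document"]
            corpus_embeddings_decoded = model.decode(model.encode(corpus_texts))

            results, search_time = semantic_search_seismic(
                query_embeddings_decoded,
                corpus_embeddings_decoded=corpus_embeddings_decoded,
                top_k=2,
            )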
    """
    try:
        from seismic import SeismicDataset, SeismicIndex, get_seismic_string
    except ImportError:
        raise ImportError("Please install Seismic with `pip install pyseismic-lsr` to use this function.")

    if index_kwargs is None:
        index_kwargs = {}
    if search_kwargs is None:
        search_kwargs = {}

    string_type = get_seismic_string()

    if not isinstance(query_embeddings_decoded, list) or not all(
        isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
        for item in query_embeddings_decoded
    ):
        raise ValueError("Query embeddings must be a list of lists in the format [[('token', value), ...], ...]")

    if corpus_index is None:
        if corpus_embeddings_decoded is None:
            raise ValueError("Either corpus_embeddings_decoded or corpus_index must be provided")
        if not isinstance(corpus_embeddings_decoded, list) or not all(
            isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
            for item in corpus_embeddings_decoded
        ):
            raise ValueError("Corpus embeddings must be a list of lists in the format [[('token', value), ...], ...]")

        dataset = SeismicDataset()
        num_docs = len(corpus_embeddings_decoded)
        for idx in tqdm(range(num_docs), desc="Adding documents to Seismic"):
            tokens = dict(corpus_embeddings_decoded[idx])
            dataset.add_document(
                str(idx),
                np.array(list(tokens.keys()), dtype=string_type),
                np.array(list(tokens.values()), dtype=np.float32),
            )

        corpus_index = SeismicIndex.build_from_dataset(dataset, **index_kwargs)

    search_start_time = time.time()
    num_queries = len(query_embeddings_decoded)
    query_components = []
    query_values = []
    for q_idx in range(num_queries):
        query_tokens = dict(query_embeddings_decoded[q_idx])
        query_components.append(np.array(list(query_tokens.keys()), dtype=string_type))
        query_values.append(np.array(list(query_tokens.values()), dtype=np.float32))

    if "query_cut" not in search_kwargs:
        search_kwargs["query_cut"] = 10
    if "heap_factor" not in search_kwargs:
        search_kwargs["heap_factor"] = 0.7

    results = corpus_index.batch_search(
        queries_ids=np.array(range(num_queries), dtype=string_type),
        query_components=query_components,
        query_values=query_values,
        k=top_k,
        **search_kwargs,
    )

    # Every per-query result is a list of (query_id, score, document_id) tuples;
    # restore the original query order before formatting.
    results = sorted(results, key=lambda x: int(x[0][0]))
    all_results = [
        [{"corpus_id": int(doc_id), "score": score} for _query_id, score, doc_id in query_result]
        for query_result in results
    ]

    search_time = time.time() - search_start_time

    if output_index:
        return all_results, search_time, corpus_index
    return all_results, search_time


def semantic_search_opensearch(
    query_embeddings_decoded: list[list[tuple[str, float]]],
    corpus_embeddings_decoded: list[list[tuple[str, float]]] | None = None,
    corpus_index: tuple[OpenSearch, str] | None = None,
    top_k: int = 10,
    output_index: bool = False,
    **kwargs: Any,
) -> (
    tuple[list[list[dict[str, int | float]]], float]
    | tuple[list[list[dict[str, int | float]]], float, tuple[OpenSearch, str]]
):
    """
    Performs semantic search using sparse embeddings with OpenSearch.

    Args:
        query_embeddings_decoded: List of query embeddings in format [[("token", value), ...], ...]
            Example: To get this format from a SparseEncoder model::

                model = SparseEncoder('my-sparse-model')
                query_texts = ["your query text"]
                query_embeddings = model.encode(query_texts)
                query_embeddings_decoded = model.decode(query_embeddings)
        corpus_embeddings_decoded: List of corpus embeddings in format [[("token", value), ...], ...]
            Only used if corpus_index is None
            Can be obtained using the same decode method as query embeddings
        corpus_index: Tuple of (OpenSearch, collection_name)
            If provided, uses this existing index for search
        top_k: Number of top results to retrieve
        output_index: Whether to return the OpenSearch client and collection name
        vocab: The dict to transform tokens into token ids

    Returns:
        A tuple containing:
        - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
        - Time taken for search
        - (Optional) Tuple of (OpenSearch, collection_name) if output_index is True
    """
    try:
        from opensearchpy import OpenSearch, helpers
    except ImportError:
        raise ImportError(
            "Please install the OpenSearch client with `pip install opensearch-py` to use this function."
        )

    if not isinstance(query_embeddings_decoded, list) or not all(
        isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
        for item in query_embeddings_decoded
    ):
        raise ValueError("Query embeddings must be a list of lists in the format [[('token', value), ...], ...]")

    if corpus_index is None:
        if corpus_embeddings_decoded is None:
            raise ValueError("Either corpus_embeddings_decoded or corpus_index must be provided")
        if not isinstance(corpus_embeddings_decoded, list) or not all(
            isinstance(item, list) and all(isinstance(t, tuple) and len(t) == 2 for t in item)
            for item in corpus_embeddings_decoded
        ):
            raise ValueError("Corpus embeddings must be a list of lists in the format [[('token', value), ...], ...]")

        os_client = OpenSearch(**kwargs)
        index_name = f"sparse_index_{int(time.time())}"

        if os_client.indices.exists(index=index_name):
            os_client.indices.delete(index=index_name)

        # Token weights are stored in a `rank_features` field, which the `neural_sparse`
        # query can score against raw query tokens.
        os_client.indices.create(
            index=index_name,
            body={
                "mappings": {
                    "properties": {
                        "tokens": {"type": "rank_features"},
                        "id": {"type": "keyword"},
                    }
                }
            },
        )

        num_docs = len(corpus_embeddings_decoded)
        batch_size = 10000
        for start_idx in tqdm(range(0, num_docs, batch_size), desc="Upserting embeddings"):
            end_idx = min(start_idx + batch_size, num_docs)
            actions = []
            for i in range(start_idx, end_idx):
                tokens = dict(corpus_embeddings_decoded[i])
                actions.append(
                    {
                        "_index": index_name,
                        "_id": str(i),
                        "_source": {"id": str(i), "tokens": tokens},
                    }
                )
            helpers.bulk(os_client, actions)
        os_client.indices.refresh(index=index_name)
        corpus_index = (os_client, index_name)

    os_client, index_name = corpus_index

    all_results = []
    search_start_time = time.time()
    for q_idx in range(len(query_embeddings_decoded)):
        query_tokens = dict(query_embeddings_decoded[q_idx])
        query = {
            "size": top_k,
            "query": {"neural_sparse": {"tokens": {"query_tokens": query_tokens}}},
        }
        result = os_client.search(index=index_name, body=query)
        formatted = [{"corpus_id": int(hit["_id"]), "score": hit["_score"]} for hit in result["hits"]["hits"]]
        all_results.append(formatted)

    search_time = time.time() - search_start_time

    if output_index:
        return all_results, search_time, corpus_index
    return all_results, search_time