
    rhC              
         d dl mZ d dlZd dlZd dlZd dlmZmZ d dlZ	d dl
Z
d dl
mZ d dlmZ ddlmZ ddlmZ  ej$                  e      Zerd d	lmZ d
dddddedddf
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddddef	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZdddef	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 ddZy)    )annotationsN)TYPE_CHECKINGCallable)Tensor)tqdm   )cos_sim)normalize_embeddings)SentenceTransformerF    i  i i  d   c           	     T    | j                  |||d|	|
|      }t        ||||||      S )a@	  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        model (SentenceTransformer): SentenceTransformer model for embedding computation
        sentences (List[str]): A list of strings (texts or sentences)
        show_progress_bar (bool, optional): Plotting of a progress bar. Defaults to False.
        batch_size (int, optional): Number of texts that are encoded simultaneously by the model. Defaults to 32.
        query_chunk_size (int, optional): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time). Defaults to 5000.
        corpus_chunk_size (int, optional): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time). Defaults to 100000.
        max_pairs (int, optional): Maximal number of text pairs returned. Defaults to 500000.
        top_k (int, optional): For each sentence, we retrieve up to top_k other sentences. Defaults to 100.
        score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. Defaults to cos_sim.
        truncate_dim (int, optional): The dimension to truncate sentence embeddings to. If None, uses the model's ones. Defaults to None.
        prompt_name (Optional[str], optional): The name of a predefined prompt to use when encoding the sentence.
            It must match a key in the model `prompts` dictionary, which can be set during model initialization
            or loaded from the model configuration.

            Ignored if `prompt` is provided. Defaults to None.

        prompt (Optional[str], optional): A raw prompt string to prepend directly to the input sentence during encoding.

            For instance, `prompt="query: "` transforms the sentence "What is the capital of France?" into:
            "query: What is the capital of France?". Use this to override the prompt logic entirely and supply your own prefix.
            This takes precedence over `prompt_name`. Defaults to None.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    T)show_progress_bar
batch_sizeconvert_to_tensortruncate_dimprompt_nameprompt)query_chunk_sizecorpus_chunk_size	max_pairstop_kscore_function)encodeparaphrase_mining_embeddings)model	sentencesr   r   r   r   r   r   r   r   r   r   
embeddingss                w/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/util/retrieval.pyparaphrase_miningr       sN    \ +!  J ()+%     c                   |dz  }t        j                         }d}d}t        dt        |       |      D ])  }	t        dt        |       |      D ]  }
 || |
|
|z    | |	|	|z          }t	        j
                  |t        |t        |d               ddd      \  }}|j                         j                         }|j                         j                         }t        t        |            D ]n  }t        ||         D ][  \  }}|
|z   }|	|z   }||k7  s||   |   |kD  s"|j                  ||   |   ||f       |dz  }||k\  sG|j                         }|d   }] p  , t               }g }|j                         sg|j                         \  }}}t        ||g      \  }}||k7  r-||f|vr'|j                  ||f       |j!                  |||g       |j                         sgt        |d d      }|S )	a  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        embeddings (Tensor): A tensor with the embeddings
        query_chunk_size (int): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
        corpus_chunk_size (int): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
        max_pairs (int): Maximal number of text pairs returned.
        top_k (int): For each sentence, we retrieve up to top_k other sentences
        score_function (Callable[[Tensor, Tensor], Tensor]): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    r   r   TFdimlargestsortedc                    | d   S )Nr    xs    r   <lambda>z.paraphrase_mining_embeddings.<locals>.<lambda>   s
    !A$ r!   keyreverse)queuePriorityQueuerangelentorchtopkmincputolist	enumerateputgetsetemptyr'   addappend)r   r   r   r   r   r   pairs	min_score	num_addedcorpus_start_idxquery_start_idxscoresscores_top_k_valuesscores_top_k_idx	query_itr	top_k_idx
corpus_itrijentryadded_pairs
pairs_listscoresorted_isorted_js                            r   r   r   X   s/   0 
QJE !EII!!S_6GH 1$QJ9IJ 	1O#?_?O-OP+.>AR.RSF
 5:JJE3vay>24PU51!1 #6"9"9";"B"B"D/335<<>"3v;/ 1	-67G	7R-S 
1)Iz')3A(:5AAv"5i"@"Ki"W		#6y#A)#LaQR"ST!Q	$	1$)IIKE(-aI
11	116 %KJkkmiikq!#QF^(xXx$8$KOOXx01uh9: kkm 
EJr!   c                     t        | i |S )z8This function is deprecated. Use semantic_search instead)semantic_search)argskwargss     r   information_retrievalrW      s    D+F++r!   
   c                F   t        | t        j                  t        j                  f      rt	        j
                  |       } n%t        | t              rt	        j                  |       } t        | j                        dk(  r| j                  d      } t        |t        j                  t        j                  f      rt	        j
                  |      }n%t        |t              rt	        j                  |      }|j                  | j                  k7  r| j                  |j                        } t        t        |             D cg c]  }g  }}t        dt        |       |      D ]  }t        ||z   t        |             }	| j                  r5t	        j                   ||	| j                        }
| j#                  d|
      }n| ||	 }t        dt        |      |      D ]^  }t        ||z   t        |            }|j                  r5t	        j                   |||j                        }
|j#                  d|
      }n||| } |||      }t	        j$                  |t        |t        |d               ddd      \  }}|j'                         j)                         }|j'                         j)                         }t        t        |            D ]n  }t+        ||   ||         D ]W  \  }}||z   }||z   }t        ||         |k  rt-        j.                  ||   ||f       =t-        j0                  ||   ||f       Y p a  t        t        |            D ]I  }t        t        ||               D ]  }||   |   \  }}||d||   |<    t3        ||   d d	      ||<   K |S c c}w )
a3  
    This function performs by default a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    Args:
        query_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the query embeddings. Can be a sparse tensor.
        corpus_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the corpus embeddings. Can be a sparse tensor.
        query_chunk_size (int, optional): Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. Defaults to 100.
        corpus_chunk_size (int, optional): Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. Defaults to 500000.
        top_k (int, optional): Retrieve top k matching entries. Defaults to 10.
        score_function (Callable[[:class:`~torch.Tensor`, :class:`~torch.Tensor`], :class:`~torch.Tensor`], optional): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Dict[str, Union[int, float]]]]: A list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    r   r   deviceTFr$   )	corpus_idrP   c                    | d   S )NrP   r)   r*   s    r   r,   z!semantic_search.<locals>.<lambda>   s    \]^e\f r!   r-   )
isinstancenpndarraygenericr4   
from_numpyliststackr3   shape	unsqueezer[   tor2   r6   	is_sparsearangeindex_selectr5   r7   r8   zipheapqheappushheappushpopr'   )query_embeddingscorpus_embeddingsr   r   r   r   _queries_result_listrD   query_end_idxindicesquery_chunkrC   corpus_end_idxcorpus_chunk
cos_scorescos_scores_top_k_valuescos_scores_top_k_idxrH   sub_corpus_idrP   r\   query_iddoc_itrs                           r   rT   rT      s   0 "RZZ$<= ++,<=	$d	+ ;;'78
!!"a'+55a8#bjj"**%=>!,,->?	%t	,!KK(9: #3#:#::+../@/G/GH',S1A-B'CD!2DD C(8$9;KL $]O.>>DT@UV%%ll?MJZJaJabG*777CK*?=IK !&a->)?AR S 	] !14E!EsK\G]^N **,,'7PaPhPhi0==aI01A.Q (\BJ =BJJCs:a='9:4X]=9#%9 '>&A&A&C&J&J&L##7#;#;#=#D#D#F "3z?3 	]	,/0DY0OQhirQs,t ](M5 0= @I.:H.x89EA/9E9;M ))*=h*G%QZI[\]	]%	]$]N #123 vS!4X!>?@ 	^G28<WEE9CLW\5])'2	^ )//B8/LRfpt(uH%	v _ Es   >	Nc                   t        | t        j                        st        j                  |       } t        j                  || j                        }t        |       } g }t        |t        |             }t        t        d|z  d      t        |             }t        t        dt        |       |      d|       D ]  }| |||z    | j                  z  }| j                  j                  dv r||k\  }	|	j                  d      }
|
|k\  }|j                         s]|
|   }
||   }|
j                         }|j                  |d	
      \  }}t!        |
|      D ]'  \  }}|j#                  |d| j%                                ) |j                  |d	
      \  }}t        t        |            D ]  }||   d   |k\  s||   j                  |d	
      \  }}|d   |kD  rV|t        |       k  rHt        d|z  t        |             }||   j                  |d	
      \  }}|d   |kD  r|t        |       k  rH|j#                  |||k\     j%                                  t'        |d d	      }g }t)               }t+        |      D ]U  \  }}g }|D ]  }||vs|j#                  |        t        |      |k\  s4|j#                  |       |j-                  |       W t'        |d d	      }|S )a  
    Function for Fast Community Detection.

    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.

    Args:
        embeddings (torch.Tensor or numpy.ndarray): The input embeddings.
        threshold (float): The threshold for determining if two embeddings are close. Defaults to 0.75.
        min_community_size (int): The minimum size of a community to be considered. Defaults to 10.
        batch_size (int): The batch size for computing cosine similarity scores. Defaults to 1024.
        show_progress_bar (bool): Whether to show a progress bar during computation. Defaults to False.

    Returns:
        List[List[int]]: A list of communities, where each community is represented as a list of indices.
    rZ      2   r   zFinding clusters)descdisable)cudanpur   T)kr&   Nr#   c                    t        |       S Nr3   r*   s    r   r,   z%community_detection.<locals>.<lambda>R  s
    A r!   r-   c                    t        |       S r   r   r*   s    r   r,   z%community_detection.<locals>.<lambda>b  s
    #a& r!   )r^   r4   r   tensorr[   r
   r6   r3   maxr   r2   Ttypesumanyr5   rk   r?   r8   r'   r<   r9   update)r   	thresholdmin_community_sizer   r   extracted_communitiessort_max_size	start_idxrx   threshold_maskrow_wise_countlarge_enough_maskr   rq   top_k_indicescountrt   top_k_valuesrK   top_val_largetop_idx_largeunique_communitiesextracted_ids
cluster_id	communitynon_overlapped_communityidxs                              r   community_detectionr     s.   0 j%,,/\\*-
Yz/@/@AI%j1J /ZAA 22B7ZIMaZ*-4FTePe *e	  	I
,BCjllR
 !!_4'94N+//2N !/2D D$((*+,=>N#$56J ""$A)DAA} #&nm"D Gw%,,WVe_-C-C-EFG )oo0BDoQOL! 3|,- 
e?2&)33=a=3E3E_c3E3d0M= (+i7MCPZO<[(+A,=s:(O7A!}7I7IMcg7I7h4} (+i7MCPZO<[ *00}PY?Y1Z1a1a1cd
eA*eZ ##8>NX\] EM!*+@!A ;
I#%  	5C-'(//4	5 '(,>>%%&>?  !9:;   28HRVWr!   )r   r   r   z	list[str]r   boolr   intr   r   r   r   r   r   r   r   r   "Callable[[Tensor, Tensor], Tensor]r   z
int | Noner   
str | Noner   r   returnlist[list[float | int]])r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   "list[list[dict[str, int | float]]])ro   r   rp   r   r   r   r   r   r   r   r   r   r   r   )g      ?rX   i   F)r   ztorch.Tensor | np.ndarrayr   floatr   r   r   r   r   r   r   zlist[list[int]])
__future__r   rl   loggingr0   typingr   r   numpyr_   r4   r   tqdm.autonotebookr   
similarityr	   r   r
   	getLogger__name__logger)sentence_transformers.SentenceTransformerr   r    r   rW   rT   r   r)   r!   r   <module>r      s   "    *    "  (			8	$M $ #9@#"??? ? 	?
 ? ? ? ? 7? ? ? ? ?H !#9@FFF F 	F
 F 7F FR,  #9@XXX X 	X
 X 7X (Xz  #c)cc c 	c
 c cr!   