
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, Any, Callable, Literal

import numpy as np
from torch import Tensor
from tqdm import tqdm

from sentence_transformers.evaluation.InformationRetrievalEvaluator import InformationRetrievalEvaluator
from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.util import is_datasets_available

if TYPE_CHECKING:
    from sentence_transformers.SentenceTransformer import SentenceTransformer

logger = logging.getLogger(__name__)

DatasetNameType = Literal[
    "climatefever",
    "dbpedia",
    "fever",
    "fiqa2018",
    "hotpotqa",
    "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]

# Mapping from the lowercase dataset name to the Hugging Face dataset id.
dataset_name_to_id = {
    "climatefever": "zeta-alpha-ai/NanoClimateFEVER",
    "dbpedia": "zeta-alpha-ai/NanoDBPedia",
    "fever": "zeta-alpha-ai/NanoFEVER",
    "fiqa2018": "zeta-alpha-ai/NanoFiQA2018",
    "hotpotqa": "zeta-alpha-ai/NanoHotpotQA",
    "msmarco": "zeta-alpha-ai/NanoMSMARCO",
    "nfcorpus": "zeta-alpha-ai/NanoNFCorpus",
    "nq": "zeta-alpha-ai/NanoNQ",
    "quoraretrieval": "zeta-alpha-ai/NanoQuoraRetrieval",
    "scidocs": "zeta-alpha-ai/NanoSCIDOCS",
    "arguana": "zeta-alpha-ai/NanoArguAna",
    "scifact": "zeta-alpha-ai/NanoSciFact",
    "touche2020": "zeta-alpha-ai/NanoTouche2020",
}

# Mapping from the lowercase dataset name to its human-readable form.
dataset_name_to_human_readable = {
    "climatefever": "ClimateFEVER",
    "dbpedia": "DBPedia",
    "fever": "FEVER",
    "fiqa2018": "FiQA2018",
    "hotpotqa": "HotpotQA",
    "msmarco": "MSMARCO",
    "nfcorpus": "NFCorpus",
    "nq": "NQ",
    "quoraretrieval": "QuoraRetrieval",
    "scidocs": "SCIDOCS",
    "arguana": "ArguAna",
    "scifact": "SciFact",
    "touche2020": "Touche2020",
}


class NanoBEIREvaluator(SentenceEvaluator):
    """
    This class evaluates the performance of a SentenceTransformer model on the NanoBEIR collection of Information Retrieval datasets.

    The collection is a set of datasets based on the BEIR collection, but with a significantly smaller size, so it can
    be used for quickly evaluating the retrieval performance of a model before committing to a full evaluation.
    The datasets are available on Hugging Face in the `NanoBEIR collection <https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6>`_.
    This evaluator will return the same metrics as the InformationRetrievalEvaluator (i.e., MRR, nDCG, Recall@k), for each dataset and on average.

    Args:
        dataset_names (List[str]): The names of the datasets to evaluate on. Defaults to all datasets.
        mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
        ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
        accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
        precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
        map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
        show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
        batch_size (int): The batch size for evaluation. Defaults to 32.
        write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
        truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
        score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to None, in which case the model's ``similarity`` function and ``similarity_fn_name`` are used.
        main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
        aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
        aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".
        query_prompts (str | dict[str, str], optional): The prompts to add to the queries. If a string, will add the same prompt to all queries. If a dict, expects that all datasets in dataset_names are keys.
        corpus_prompts (str | dict[str, str], optional): The prompts to add to the corpus. If a string, will add the same prompt to all corpus passages. If a dict, expects that all datasets in dataset_names are keys.
        write_predictions (bool): Whether to write the predictions to a JSONL file. Defaults to False.
            This can be useful for downstream evaluation, as the file can be used as input to the :class:`~sentence_transformers.sparse_encoder.evaluation.ReciprocalRankFusionEvaluator`, which accepts precomputed predictions.

    Example:
        ::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.evaluation import NanoBEIREvaluator

            model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

            datasets = ["QuoraRetrieval", "MSMARCO"]
            query_prompts = {
                "QuoraRetrieval": "Instruct: Given a question, retrieve questions that are semantically equivalent to the given question\nQuery: ",
                "MSMARCO": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
            }

            evaluator = NanoBEIREvaluator(
                dataset_names=datasets,
                query_prompts=query_prompts,
            )

            results = evaluator(model)
            '''
            NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset:
            Evaluating NanoQuoraRetrieval
            Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset:
            Queries: 50
            Corpus: 5046

            Score-Function: cosine
            Accuracy@1: 92.00%
            Accuracy@3: 98.00%
            Accuracy@5: 100.00%
            Accuracy@10: 100.00%
            Precision@1: 92.00%
            Precision@3: 40.67%
            Precision@5: 26.00%
            Precision@10: 14.00%
            Recall@1: 81.73%
            Recall@3: 94.20%
            Recall@5: 97.93%
            Recall@10: 100.00%
            MRR@10: 0.9540
            NDCG@10: 0.9597
            MAP@100: 0.9395

            Evaluating NanoMSMARCO
            Information Retrieval Evaluation of the model on the NanoMSMARCO dataset:
            Queries: 50
            Corpus: 5043

            Score-Function: cosine
            Accuracy@1: 40.00%
            Accuracy@3: 74.00%
            Accuracy@5: 78.00%
            Accuracy@10: 88.00%
            Precision@1: 40.00%
            Precision@3: 24.67%
            Precision@5: 15.60%
            Precision@10: 8.80%
            Recall@1: 40.00%
            Recall@3: 74.00%
            Recall@5: 78.00%
            Recall@10: 88.00%
            MRR@10: 0.5849
            NDCG@10: 0.6572
            MAP@100: 0.5892
            Average Queries: 50.0
            Average Corpus: 5044.5

            Aggregated for Score Function: cosine
            Accuracy@1: 66.00%
            Accuracy@3: 86.00%
            Accuracy@5: 89.00%
            Accuracy@10: 94.00%
            Precision@1: 66.00%
            Recall@1: 60.87%
            Precision@3: 32.67%
            Recall@3: 84.10%
            Precision@5: 20.80%
            Recall@5: 87.97%
            Precision@10: 11.40%
            Recall@10: 94.00%
            MRR@10: 0.7694
            NDCG@10: 0.8085
            '''
            print(evaluator.primary_metric)
            # => "NanoBEIR_mean_cosine_ndcg@10"
            print(results[evaluator.primary_metric])
            # => 0.8084508771660436
    """

    information_retrieval_class = InformationRetrievalEvaluator

    def __init__(
        self,
        dataset_names: list[DatasetNameType] | None = None,
        mrr_at_k: list[int] = [10],
        ndcg_at_k: list[int] = [10],
        accuracy_at_k: list[int] = [1, 3, 5, 10],
        precision_recall_at_k: list[int] = [1, 3, 5, 10],
        map_at_k: list[int] = [100],
        show_progress_bar: bool = False,
        batch_size: int = 32,
        write_csv: bool = True,
        truncate_dim: int | None = None,
        score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] | None = None,
        main_score_function: str | SimilarityFunction | None = None,
        aggregate_fn: Callable[[list[float]], float] = np.mean,
        aggregate_key: str = "mean",
        query_prompts: str | dict[str, str] | None = None,
        corpus_prompts: str | dict[str, str] | None = None,
        write_predictions: bool = False,
    ):
        super().__init__()
        if dataset_names is None:
            dataset_names = list(dataset_name_to_id.keys())
        self.dataset_names = dataset_names
        self.aggregate_fn = aggregate_fn
        self.aggregate_key = aggregate_key
        self.truncate_dim = truncate_dim
        self.query_prompts = query_prompts
        self.corpus_prompts = corpus_prompts
        self.show_progress_bar = show_progress_bar
        self.write_csv = write_csv
        self.score_functions = score_functions
        self.score_function_names = sorted(list(self.score_functions.keys())) if score_functions else []
        self.main_score_function = main_score_function
        self.name = f"NanoBEIR_{aggregate_key}"
        if self.truncate_dim:
            self.name += f"_{self.truncate_dim}"
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k

        self._validate_dataset_names()
        self._validate_prompts()

        # These keyword arguments are forwarded to every per-dataset InformationRetrievalEvaluator.
        ir_evaluator_kwargs = {
            "mrr_at_k": mrr_at_k,
            "ndcg_at_k": ndcg_at_k,
            "accuracy_at_k": accuracy_at_k,
            "precision_recall_at_k": precision_recall_at_k,
            "map_at_k": map_at_k,
            "show_progress_bar": show_progress_bar,
            "batch_size": batch_size,
            "write_csv": write_csv,
            "truncate_dim": truncate_dim,
            "score_functions": score_functions,
            "main_score_function": main_score_function,
            "write_predictions": write_predictions,
        }
        self.evaluators = [
            self._load_dataset(dataset_name, **ir_evaluator_kwargs)
            for dataset_name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False)
        ]
        self.csv_file = f"NanoBEIR_evaluation_{aggregate_key}_results.csv"
        self.csv_headers = ["epoch", "steps"]
        self._append_csv_headers(self.score_function_names)

    def _append_csv_headers(self, score_function_names: list[str]) -> None:
        for score_name in score_function_names:
            for k in self.accuracy_at_k:
                self.csv_headers.append(f"{score_name}-Accuracy@{k}")
            for k in self.precision_recall_at_k:
                self.csv_headers.append(f"{score_name}-Precision@{k}")
                self.csv_headers.append(f"{score_name}-Recall@{k}")
            for k in self.mrr_at_k:
                self.csv_headers.append(f"{score_name}-MRR@{k}")
            for k in self.ndcg_at_k:
                self.csv_headers.append(f"{score_name}-NDCG@{k}")
            for k in self.map_at_k:
                self.csv_headers.append(f"{score_name}-MAP@{k}")

    def __call__(
        self,
        model: SentenceTransformer,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args,
        **kwargs,
    ) -> dict[str, float]:
        per_metric_results = {}
        per_dataset_results = {}
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        if self.truncate_dim is not None:
            out_txt += f" (truncated to {self.truncate_dim})"

        logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:")

        if self.score_functions is None:
            self.score_functions = {model.similarity_fn_name: model.similarity}
            self.score_function_names = [model.similarity_fn_name]
            self._append_csv_headers(self.score_function_names)

        # Each sub-evaluator prefixes its metrics with its own name; strip that prefix so the same
        # metric (e.g. "cosine_ndcg@10") can be aggregated across datasets.
        num_underscores_in_name = self.name.count("_")
        for evaluator in tqdm(self.evaluators, desc="Evaluating", disable=not self.show_progress_bar):
            logger.info(f"Evaluating {evaluator.name}")
            evaluation = evaluator(model, output_path, epoch, steps)
            for full_key, metric_value in evaluation.items():
                splits = full_key.split("_", maxsplit=num_underscores_in_name)
                metric = splits[-1]
                if metric not in per_metric_results:
                    per_metric_results[metric] = []
                per_dataset_results[full_key] = metric_value
                per_metric_results[metric].append(metric_value)

        agg_results = {}
        for metric in per_metric_results:
            agg_results[metric] = self.aggregate_fn(per_metric_results[metric])

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                fOut = open(csv_path, mode="w", encoding="utf-8")
                fOut.write(",".join(self.csv_headers))
                fOut.write("\n")
            else:
                fOut = open(csv_path, mode="a", encoding="utf-8")

            output_data = [epoch, steps]
            for name in self.score_function_names:
                for k in self.accuracy_at_k:
                    output_data.append(agg_results[f"{name}_accuracy@{k}"])
                for k in self.precision_recall_at_k:
                    output_data.append(agg_results[f"{name}_precision@{k}"])
                    output_data.append(agg_results[f"{name}_recall@{k}"])
                for k in self.mrr_at_k:
                    output_data.append(agg_results[f"{name}_mrr@{k}"])
                for k in self.ndcg_at_k:
                    output_data.append(agg_results[f"{name}_ndcg@{k}"])
                for k in self.map_at_k:
                    output_data.append(agg_results[f"{name}_map@{k}"])

            fOut.write(",".join(map(str, output_data)))
            fOut.write("\n")
            fOut.close()

        if not self.primary_metric:
            if self.main_score_function is None:
                score_function = max(
                    [(name, agg_results[f"{name}_ndcg@{max(self.ndcg_at_k)}"]) for name in self.score_function_names],
                    key=lambda x: x[1],
                )[0]
                self.primary_metric = f"{score_function}_ndcg@{max(self.ndcg_at_k)}"
            else:
                self.primary_metric = f"{self.main_score_function.value}_ndcg@{max(self.ndcg_at_k)}"

        avg_queries = np.mean([len(evaluator.queries) for evaluator in self.evaluators])
        avg_corpus = np.mean([len(evaluator.corpus) for evaluator in self.evaluators])
        logger.info(f"Average Queries: {avg_queries}")
        logger.info(f"Average Corpus: {avg_corpus}\n")

        for name in self.score_function_names:
            logger.info(f"Aggregated for Score Function: {name}")
            for k in self.accuracy_at_k:
                logger.info("Accuracy@{}: {:.2f}%".format(k, agg_results[f"{name}_accuracy@{k}"] * 100))
            for k in self.precision_recall_at_k:
                logger.info("Precision@{}: {:.2f}%".format(k, agg_results[f"{name}_precision@{k}"] * 100))
                logger.info("Recall@{}: {:.2f}%".format(k, agg_results[f"{name}_recall@{k}"] * 100))
            for k in self.mrr_at_k:
                logger.info("MRR@{}: {:.4f}".format(k, agg_results[f"{name}_mrr@{k}"]))
            for k in self.ndcg_at_k:
                logger.info("NDCG@{}: {:.4f}".format(k, agg_results[f"{name}_ndcg@{k}"]))
            for k in self.map_at_k:
                logger.info("MAP@{}: {:.4f}".format(k, agg_results[f"{name}_map@{k}"]))

        agg_results = self.prefix_name_to_metrics(agg_results, self.name)
        self.store_metrics_in_model_card_data(model, agg_results, epoch, steps)
        per_dataset_results.update(agg_results)

        return per_dataset_results

    def _get_human_readable_name(self, dataset_name: DatasetNameType) -> str:
        human_readable_name = f"Nano{dataset_name_to_human_readable[dataset_name.lower()]}"
        if self.truncate_dim is not None:
            human_readable_name += f"_{self.truncate_dim}"
        return human_readable_name

    def _load_dataset(self, dataset_name: DatasetNameType, **ir_evaluator_kwargs) -> InformationRetrievalEvaluator:
        if not is_datasets_available():
            raise ValueError(
                "datasets is not available. Please install it to use the NanoBEIREvaluator via `pip install datasets`."
            )
        from datasets import load_dataset

        dataset_path = dataset_name_to_id[dataset_name.lower()]
        corpus = load_dataset(dataset_path, "corpus", split="train")
        queries = load_dataset(dataset_path, "queries", split="train")
        qrels = load_dataset(dataset_path, "qrels", split="train")
        corpus_dict = {sample["_id"]: sample["text"] for sample in corpus if len(sample["text"]) > 0}
        queries_dict = {sample["_id"]: sample["text"] for sample in queries if len(sample["text"]) > 0}
        qrels_dict = {}
        for sample in qrels:
            if sample["query-id"] not in qrels_dict:
                qrels_dict[sample["query-id"]] = set()
            qrels_dict[sample["query-id"]].add(sample["corpus-id"])

        if self.query_prompts is not None:
            ir_evaluator_kwargs["query_prompt"] = self.query_prompts.get(dataset_name, None)
        if self.corpus_prompts is not None:
            ir_evaluator_kwargs["corpus_prompt"] = self.corpus_prompts.get(dataset_name, None)
        human_readable_name = self._get_human_readable_name(dataset_name)
        return self.information_retrieval_class(
            queries=queries_dict,
            corpus=corpus_dict,
            relevant_docs=qrels_dict,
            name=human_readable_name,
            **ir_evaluator_kwargs,
        )

    def _validate_dataset_names(self) -> None:
        if len(self.dataset_names) == 0:
            raise ValueError("dataset_names cannot be empty. Use None to evaluate on all datasets.")
        if missing_datasets := [
            dataset_name for dataset_name in self.dataset_names if dataset_name.lower() not in dataset_name_to_id
        ]:
            raise ValueError(
                f"Dataset(s) {missing_datasets} not found in the NanoBEIR collection. "
                f"Valid dataset names are: {list(dataset_name_to_id.keys())}"
            )

    def _validate_prompts(self) -> None:
        error_msg = ""
        if self.query_prompts is not None:
            if isinstance(self.query_prompts, str):
                self.query_prompts = {dataset_name: self.query_prompts for dataset_name in self.dataset_names}
            elif missing_query_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.query_prompts
            ]:
                error_msg += f"The following datasets are missing query prompts: {missing_query_prompts}\n"

        if self.corpus_prompts is not None:
            if isinstance(self.corpus_prompts, str):
                self.corpus_prompts = {dataset_name: self.corpus_prompts for dataset_name in self.dataset_names}
            elif missing_corpus_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.corpus_prompts
            ]:
                error_msg += f"The following datasets are missing corpus prompts: {missing_corpus_prompts}\n"

        if error_msg:
            raise ValueError(error_msg.strip())

    def store_metrics_in_model_card_data(self, *args, **kwargs) -> None:
        # Only store aggregated metrics in the model card when more than one dataset was evaluated.
        if len(self.dataset_names) > 1:
            super().store_metrics_in_model_card_data(*args, **kwargs)

    def get_config_dict(self) -> dict[str, Any]:
        config_dict = {"dataset_names": self.dataset_names}
        config_dict_candidate_keys = ["truncate_dim", "query_prompts", "corpus_prompts"]
        for key in config_dict_candidate_keys:
            if getattr(self, key, None) is not None:
                config_dict[key] = getattr(self, key)
        return config_dict