
    rh'9                        d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
mZ d dlmZ d dlmZ erd dlmZ  ej"                  e      Z G d d	e      Zy)
    )annotationsN)TYPE_CHECKING)average_precision_score
ndcg_score)tqdm)SentenceEvaluator)CrossEncoderc                  |     e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 d	 	 	 	 	 	 	 	 	 d	dZd Zd Z xZS )
CrossEncoderRerankingEvaluatora  
    This class evaluates a CrossEncoder model for the task of re-ranking.

    Given a query and a list of documents, it computes the score [query, doc_i] for all possible
    documents and sorts them in decreasing order. Then, MRR@10, NDCG@10 and MAP are computed to measure the quality of the ranking.

    The evaluator expects a list of samples. Each sample is a dictionary with the mandatory "query" and "positive" keys,
    and either a "negative" or a "documents" key. The "query" is the search query, the "positive" is a list of relevant
    documents, and the "negative" is a list of irrelevant documents. Alternatively, the "documents" key can be used to
    provide a list of all documents, including the positive ones. In this case, the evaluator will assume that the list
    is already ranked by similarity, with the most similar documents first, and will report both the reranking performance
    as well as the performance before reranking. This can be useful to measure the improvement of the reranking on
    top of a first-stage retrieval (e.g. a SentenceTransformer model).

    Note that the maximum score is 1.0 by default, because all positive documents are included in the ranking. This
    can be toggled off by using samples with ``documents`` instead of ``negative``, i.e. ranked lists of all documents
    including the positive ones, together with ``always_rerank_positives=False``. ``always_rerank_positives=False`` only
    works when using ``documents`` instead of ``negative``.

    Args:
        samples (list): A list of dictionaries, where each dictionary represents a sample and has the following keys:
            - 'query' (mandatory): The search query.
            - 'positive' (mandatory): A list of positive (relevant) documents.
            - 'negative' (optional): A list of negative (irrelevant) documents. Mutually exclusive with 'documents'.
            - 'documents' (optional): A list of all documents, including the positive ones. This list is assumed to be
                ranked by similarity, with the most similar documents first. Mutually exclusive with 'negative'.
        at_k (int, optional): Only consider the top k most similar documents to each query for the evaluation. Defaults to 10.
        always_rerank_positives (bool): If True, always evaluate with all positives included. If False, only include
            the positives that are already in the documents list. Always set to True if your ``samples`` contain ``negative``
            instead of ``documents``. When using ``documents``, setting this to True will result in a more useful evaluation
            signal, but setting it to False will result in a more realistic evaluation. Defaults to True.
        name (str, optional): Name of the evaluator, used for logging, saving in a CSV, and the model card. Defaults to "".
        batch_size (int): Batch size to compute sentence embeddings. Defaults to 64.
        show_progress_bar (bool): Show progress bar when computing embeddings. Defaults to False.
        write_csv (bool): Write results to CSV file. Defaults to True.
        mrr_at_k (Optional[int], optional): Deprecated parameter. Please use `at_k` instead. Defaults to None.

    Example:
        ::

            from sentence_transformers import CrossEncoder
            from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
            from datasets import load_dataset

            # Load a model
            model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

            # Load a dataset with queries, positives, and negatives
            eval_dataset = load_dataset("microsoft/ms_marco", "v1.1", split="validation")

            samples = [
                {
                    "query": sample["query"],
                    "positive": [text for is_selected, text in zip(sample["passages"]["is_selected"], sample["passages"]["passage_text"]) if is_selected],
                    "documents": sample["passages"]["passage_text"],
                    # or
                    # "negative": [text for is_selected, text in zip(sample["passages"]["is_selected"], sample["passages"]["passage_text"]) if not is_selected],
                }
                for sample in eval_dataset
            ]

            # Initialize the evaluator
            reranking_evaluator = CrossEncoderRerankingEvaluator(
                samples=samples,
                name="ms-marco-dev",
                show_progress_bar=True,
            )
            results = reranking_evaluator(model)
            '''
            CrossEncoderRerankingEvaluator: Evaluating the model on the ms-marco-dev dataset:
            Queries: 10047    Positives: Min 0.0, Mean 1.1, Max 5.0   Negatives: Min 1.0, Mean 7.1, Max 10.0
                     Base  -> Reranked
            MAP:     34.03 -> 62.36
            MRR@10:  34.67 -> 62.96
            NDCG@10: 49.05 -> 71.05
            '''
            print(reranking_evaluator.primary_metric)
            # => ms-marco-dev_ndcg@10
            print(results[reranking_evaluator.primary_metric])
            # => 0.7104656857184184
    c	                   t         	|           || _        |!t        j	                  d| d       || _        n|| _        || _        || _        || _        || _	        t        | j                  t              r(t        | j                  j                               | _        d|rd|z   ndz   d| j
                   dz   | _        dd	d
d| j
                   d| j
                   g| _        || _        d| j
                   | _        y )Nz?The `mrr_at_k` parameter has been deprecated; please use `at_k=z
` instead.r   _ z
_results_@z.csvepochstepsMAPMRR@NDCG@ndcg@)super__init__samplesloggerwarningat_kalways_rerank_positivesname
batch_sizeshow_progress_bar
isinstancedictlistvaluescsv_filecsv_headers	write_csvprimary_metric)
selfr   r   r   r   r   r   r%   mrr_at_k	__class__s
            /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/cross_encoder/evaluation/reranking.pyr   z'CrossEncoderRerankingEvaluator.__init__g   s     	NN\]e\ffpqr DIDI'>$	$!2dllD) 3 3 56DL8$C$JTVW\fgkgpgpfqquZvv#WetDII;5G5QUQZQZP[I\]" %dii[1    c                   |dk7  r|dk(  rd| }nd| d| d}nd}t         j                  d| j                   d| d	       g }g }g }g }	g }
g }d
}g }g }t        | j                  d| j
                   d      D ]	  }d|vrt        d      d|vrt        d      d|v rd|v sd|vrd|vrt        d      |d   }|d   }t        |t              r|g}|j                  dd       }|j                  dd       }|r,|D cg c]  }t        ||v        }}t        |      d
k(  rd\  }}}n]|dgt        |      t        |      z
  z  z  }t        j                  t        t        |      d
d            }| j!                  ||      \  }}}|j#                  |       |j#                  |       |j#                  |       | j$                  rD||D cg c]	  }||vs| c}z   }dgt        |      z  d
gt        |      t        |      z
  z  z   }nA|}|D cg c]  }t        ||v        }}n$||z   }dgt        |      z  d
gt        |      z  z   }|dz  }|j#                  t        |             |j#                  t        |      t        |      z
         t        |      d
k(  r5|	j#                  d
       |
j#                  d
       |j#                  d
       Y|D cg c]  }||g }}|j'                  |dd      }t        |      t        |      z
  x}r*t        j(                  |t        j*                  |      g      }| j!                  ||      \  } }!}"|	j#                  |        |
j#                  |!       |j#                  |"        t        j,                  |	      }#t        j,                  |
      }$t        j,                  |      }%d|%d| j.                   |#d| j.                   |$i}&t         j                  d| dt        j0                  |      ddt        j,                  |      dd t        j2                  |      dd!t        j0                  |      ddt        j,                  |      dd t        j2                  |      d       rt        j,                  |      }'t        j,                  |      }(t        j,                  |      })d"|)d#| j.                   |'d$| j.                   |(i}*t         j                  d%t        t        | j.                              z   d&       t         j                  d'd%t        t        | j.                              z   d(|)d)z  d*d+|%d)z  d*       t         j                  d,| j.                   d-|'d)z  d*d+|#d)z  d*       t         j                  d.| j.                   d/|(d)z  d*d+|$d)z  d*       d|%d0d1|%|)z
  d2d3d| j.                   |#d0d1|#|'z
  d2d3d| j.                   |$d0d1|$|(z
  d2d3i}+| j5                  |+| j                        }+| j7                  ||+||       |&j9                  |*       | j5                  |&| j                        }&nt         j                  d'd%t        t        | j.                              z   d(|%d)z  d*       t         j                  d,| j.                   d-|#d)z  d*       t         j                  d.| j.                   d/|$d)z  d*       | j5                  |&| j                        }&| j7                  ||&||       || j:                  rt<        j>                  jA                  || jB                        },t<        j>                  jE                  |,      }-tG        |,|-rd4nd5d67      5 }.tI        jJ                  |.      }/|-s|/jM                  | jN                         |/jM                  |||%|#|$g       d d d        |&S |&S c c}w c c}w c c}w c c}w # 1 sw Y   |&S xY w)8Nz after epoch z
 in epoch z after z stepsr   z<CrossEncoderRerankingEvaluator: Evaluating the model on the z dataset:r   zEvaluating samplesF)descdisableleavequeryzECrossEncoderRerankingEvaluator requires a 'query' key in each sample.positivezHCrossEncoderRerankingEvaluator requires a 'positive' key in each sample.negative	documentszaCrossEncoderRerankingEvaluator requires exactly one of 'negative' and 'documents' in each sample.)r   r   r      T)convert_to_numpyr   mapzmrr@r   z	Queries: z	Positives: Min z.1fz, Mean z, Max z	Negatives: Min base_mapz	base_mrr@z
base_ndcg@ z       Base  -> RerankedzMAP:z   d   z.2fz -> r   z:  r   z: z.4fz (z+.4f)awzutf-8)modeencoding)(r   infor   r   r   r   
ValueErrorr   strgetintsumlennparrayrangecompute_metricsappendr   predictconcatenatezerosmeanr   minmaxprefix_name_to_metrics store_metrics_in_model_card_dataupdater%   ospathjoinr#   isfileopencsvwriterwriterowr$   )0r'   modeloutput_pathr   r   out_txtbase_mrr_scoresbase_ndcg_scoresbase_ap_scoresall_mrr_scoresall_ndcg_scoresall_ap_scoresnum_queriesnum_positivesnum_negativesinstancer2   r3   r4   r5   samplebase_is_relevantbase_mrr	base_ndcgbase_apbase_pred_scoresdocdocsis_relevantmodel_inputpred_scoresnum_ignored_positivesmrrndcgapmean_mrr	mean_ndcgmean_apmetricsmean_base_mrrmean_base_ndcgmean_base_apbase_metricsmodel_card_metricscsv_pathoutput_file_existsfr\   s0                                                   r*   __call__z'CrossEncoderRerankingEvaluator.__call__   s.    B;{)%1&ugWUG6BGRSWS\S\R]]efmennopqT\\0DRVRhRhNhpuv A	%Hh& !hii) !kllh&;(+B(*{(/J w  W%E
+H(C($:||J5H [$7IJS#TC((:$;#T #T'(A-3:0Hi %s8}sCS?T/T(UU$')xxc:J6KQPR0S'T$373G3GHXZj3k0Hi&&x0 ''	2%%g.//#i&Ws3hCVs&WWD#$#H"5s4y3x=?X8Y"YK$DIR"Sv3v'9#:"SK"S(* cCM1QC#h-4GG1K  X/  [!1C4D!DE;1$%%a(&&q)$$Q'378CE3<8K8--d^c-dK ),K(83{;K(KK$K nnk288DY;Z-[\ 00kJMCr!!#&""4(  $CA	%F 77>*GGO,	''-(7499+DII;
 	} % ff]3C8@VWZ?[[abdbhbhivbwx{a| } ff]3C8@VWZ?[[abdbhbhivbwx{a|~	

 GGO4MWW%56N77>2LLDII;'TYYK(.L
 KK3S^!4455MNOKK$sSTYY%889\C=OPS<TTXY`cfYfgjXklmKK$tyyk]S-@,ET(UX.Y\I]^_KK%		{"^c-A#,Fd9WZ?[^J_`a '#b<)?(EQGtyyk"xnBx-7OPT6UUV$W		{#	#b^9STX8YYZ%["
 "&!<!<=OQUQZQZ![11%9KUTYZNN<(11'499EGKK$sSTYY%889Ws]3<OPQKK$tyykX^C,@ABKK%		{"Y_S,ABC11'499EG11%%O"t~~ww||K?H!#!9h,>SCRYZ N^_A)OOD$4$45w) LMN wM $U 'X #T  9|N s+   >_(
	_-_-_2_7A	_<<`c                    t        j                  |      d d d   }d}t        |d| j                         D ]  \  }}||   sd|dz   z  } n t	        |g|g| j                        }t        ||      }|||fS )Nr-   r   r6   )k)rH   argsort	enumerater   r   r   )	r'   y_truey_predrankingrw   rankindexrx   ry   s	            r*   rK   z.CrossEncoderRerankingEvaluator.compute_metrics  s    **V$TrT*$WQ%;< 	KD%e}4!8n	
 6(VH		:$VV4D"}r+   c                z    d| j                   i}| j                  r d| j                  d   v r| j                  |d<   |S )Nr   r5   r   r   )r   r   r   )r'   config_dicts     r*   get_config_dictz.CrossEncoderRerankingEvaluator.get_config_dict'  sA    DII
 <<K4<<?:595Q5QK12r+   )
   Tr   @   FTN)r   z list[dict[str, str | list[str]]]r   rE   r   boolr   rC   r   rE   r   r   r%   r   r(   z
int | None)Nr-   r-   )
r^   r	   r_   z
str | Noner   rE   r   rE   returnzdict[str, float])	__name__
__module____qualname____doc__r   r   rK   r   __classcell__)r)   s   @r*   r   r      s    Pj (,"'#212 2 "&	2
 2 2  2 2 2B bdQ!Q0:QJMQ[^Q	Qfr+   r   )
__future__r   r[   loggingrV   typingr   numpyrH   sklearn.metricsr   r   r   2sentence_transformers.evaluation.SentenceEvaluatorr   0sentence_transformers.cross_encoder.CrossEncoderr	   	getLoggerr   r   r    r+   r*   <module>r      sG    " 
  	    ?  PM			8	$Y%6 Yr+   