
    rh                        d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
 er
d dlZd dlmZ  ej                  e      Z G d de
      Zy)    )annotationsN)TYPE_CHECKING)SentenceEvaluator)SentenceTransformerc                       e Zd ZdZ	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 d	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZedd       Z xZ	S )MSEEvaluatorFromDataFrameu  
    Computes the mean squared error (x100) between the computed sentence embedding and some target sentence embedding.

    Args:
        dataframe (List[Dict[str, str]]): It must have the following format. Rows contains different, parallel sentences.
            Columns are the respective language codes::

            [{'en': 'My sentence in English', 'es': 'Oración en español', 'fr': 'Phrase en français'...},
             {'en': 'My second sentence', ...}]
        teacher_model (SentenceTransformer): The teacher model used to compute the sentence embeddings.
        combinations (List[Tuple[str, str]]): Must be of the format ``[('en', 'es'), ('en', 'fr'), ...]``.
            First entry in a tuple is the source language. The sentence in the respective language will be fetched from
            the dataframe and passed to the teacher model. Second entry in a tuple the the target language. Sentence
            will be fetched from the dataframe and passed to the student model
        batch_size (int, optional): The batch size to compute sentence embeddings. Defaults to 8.
        name (str, optional): The name of the evaluator. Defaults to "".
        write_csv (bool, optional): Whether to write the results to a CSV file. Defaults to True.
        truncate_dim (Optional[int], optional): The dimension to truncate sentence embeddings to. If None, uses the model's
            current truncation dimension. Defaults to None.
    c                   t         |           || _        || _        || _        |rd|z   }d|z   dz   | _        ddg| _        d| _        || _        || _	        i | _
        t        j                  d       t               }| j                  D ]  \  }	}
g }g }|D ]l  }||	   j                         dk7  s||
   j                         dk7  s1|j                  ||	          |j!                  ||	          |j!                  ||
          n ||f| j                  |	|
f<   | j                  j!                  |	 d	|
         t#        |      }| j%                  ||      }t'        ||      D ci c]  \  }}||
 c}}| _        y c c}}w )
N_mse_evaluationz_results.csvepochstepsnegative_msezCompute teacher embeddings -)super__init__combinationsname
batch_sizecsv_filecsv_headersprimary_metric	write_csvtruncate_dimdataloggerinfosetstripaddappendlistembed_inputszipteacher_embeddings)self	dataframeteacher_modelr   r   r   r   r   all_source_sentencessrc_langtrg_langsrc_sentencestrg_sentencesrowall_src_embeddingssentemb	__class__s                    /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.pyr   z"MSEEvaluatorFromDataFrame.__init__*   s    	(	$:D(4/.@#W-,"(	01"u"&"3"3 	>HhMM  8x=&&(B.3x=3F3F3HB3N(,,S];!((X7!((X7	8 0=m.LDIIx*+##xj($<=	>  $$89!..}>RS>ABVXj>k"ls49"l"ls   %E<c           
     D   |j                          g }| j                  D ]  \  }}| j                  ||f   \  }}	t        j                  |D 
cg c]  }
| j
                  |
    c}
      }t        j                  | j                  ||	            }||z
  dz  j                         }|dz  }|j                  |       t        j                  d| j                   d| d| d       t        j                  d|d        || j                  rt        j                  j                  || j                         }t        j                  j#                  |      }t%        |d	|rd
ndd      5 }t'        j(                  |      }|s|j+                  | j,                         |j+                  ||g|z          d d d        dt        j                  |      j/                          i}| j1                  || j                        }| j3                  ||||       |S c c}
w # 1 sw Y   fxY w)N   d   zMSE evaluation on z dataset - r   :zMSE (*100):	4fr   awzutf-8)newlinemodeencodingr   )evalr   r   npasarrayr%   r#   meanr!   r   r   r   r   ospathjoinr   isfileopencsvwriterwriterowr   itemprefix_name_to_metrics store_metrics_in_model_card_data)r&   modeloutput_pathr   r   
mse_scoresr*   r+   r,   r-   r0   src_embeddingstrg_embeddingsmsecsv_pathoutput_file_existsfrH   metricss                      r3   __call__z"MSEEvaluatorFromDataFrame.__call__W   s    	


"&"3"3 	2Hh+/99h5I+J(M=ZZS`(a4)@)@)F(abNZZ(9(9%(OPN"^39??AC3JCc"KK,TYYK{8*AhZWXYZKK-Bx01	2 "t~~ww||K?H!#!9h8JPS^ef =jkA)OOD$4$45 ;<= "BGGJ$7$<$<$>#>?--gtyyA--eWeUK1 )b= =s   H
(A	HHc                X     |j                   |f| j                  d| j                  d|S )NT)r   convert_to_numpyr   )encoder   r   )r&   rM   	sentenceskwargss       r3   r#   z&MSEEvaluatorFromDataFrame.embed_inputsz   s<     u||
!**	

 
 	
    c                     y)NzKnowledge Distillation )r&   s    r3   descriptionz%MSEEvaluatorFromDataFrame.description   s    'r]   )   r   TN)r'   zlist[dict[str, str]]r(   r   r   zlist[tuple[str, str]]r   intr   strr   boolr   z
int | None)Nre   )
rM   r   rN   z
str | Noner   rb   r   rb   returnzdict[str, float])rM   r   r[   zstr | list[str] | np.ndarrayrf   z
np.ndarray)rf   rc   )
__name__
__module____qualname____doc__r   rW   r#   propertyr`   __classcell__)r2   s   @r3   r   r      s    4 #'+m'+m ++m ,	+m
 +m +m +m !+m\ ik!(!7A!QT!be!	!F
"
 0

 

 ( (r]   r   )
__future__r   rG   loggingrB   typingr   numpyr?   2sentence_transformers.evaluation.SentenceEvaluatorr   )sentence_transformers.SentenceTransformerr   	getLoggerrg   r   r   r_   r]   r3   <module>rt      sD    " 
  	    PM			8	$v( 1 v(r]   